
[tor-commits] [tor-browser] 160/311: Bug 1758482 - Update dav1d to new version 28a9c46e1c36540d3276299f2e284ece1d2386be from 2022-02-04T23:02:17.000-03:00. r=media-playback-reviewers, padenot a=dmeehan



This is an automated email from the git hooks/post-receive script.

pierov pushed a commit to branch geckoview-99.0.1-11.0-1
in repository tor-browser.

commit 0c755a24b73056a9381d7cf3cd2b38281dbe4c43
Author: Jon Bauman <jbauman@xxxxxxxxxxx>
AuthorDate: Mon Mar 14 17:17:59 2022 +0000

    Bug 1758482 - Update dav1d to new version 28a9c46e1c36540d3276299f2e284ece1d2386be from 2022-02-04T23:02:17.000-03:00. r=media-playback-reviewers,padenot a=dmeehan
    
    Normally updatebot would create a revision, allowing me to review it single-handedly, but https://phabricator.services.mozilla.com/D140519 was a failure because of changes updatebot didn't know how to handle, so I just need **someone** to approve the update to get this landed. This has a bit of priority since it's blocking https://bugzilla.mozilla.org/show_bug.cgi?id=1757971, which we want to get uplifted for Fx99.
    
    Differential Revision: https://phabricator.services.mozilla.com/D140921
---
 media/libdav1d/README_MOZILLA                      |    3 +-
 media/libdav1d/asm/moz.build                       |   30 +-
 media/libdav1d/moz.build                           |    6 +-
 media/libdav1d/moz.yaml                            |    4 +-
 media/libdav1d/vcs_version.h                       |    2 +-
 media/libdav1d/version.h                           |    4 +-
 third_party/dav1d/NEWS                             |   29 +
 third_party/dav1d/README.md                        |   12 +-
 third_party/dav1d/THANKS.md                        |   28 +-
 third_party/dav1d/include/dav1d/common.h           |    5 +
 third_party/dav1d/include/dav1d/dav1d.h            |   13 +
 third_party/dav1d/meson.build                      |    8 +-
 third_party/dav1d/src/arm/32/filmgrain.S           | 2039 +++++++++++++++++++
 third_party/dav1d/src/arm/32/filmgrain16.S         | 2137 ++++++++++++++++++++
 third_party/dav1d/src/arm/32/itx.S                 |  358 ++--
 third_party/dav1d/src/arm/64/filmgrain.S           | 2010 ++++++++++++++++++
 third_party/dav1d/src/arm/64/filmgrain16.S         | 1997 ++++++++++++++++++
 third_party/dav1d/src/arm/64/itx.S                 |  282 +--
 third_party/dav1d/src/arm/64/looprestoration.S     |    4 +
 third_party/dav1d/src/arm/64/looprestoration16.S   |    4 +
 third_party/dav1d/src/arm/asm.S                    |  115 +-
 ...ilm_grain_init_tmpl.c => filmgrain_init_tmpl.c} |    2 +-
 third_party/dav1d/src/data.c                       |   13 +-
 third_party/dav1d/src/data.h                       |    1 +
 third_party/dav1d/src/decode.c                     |   20 +-
 third_party/dav1d/src/ext/x86/x86inc.asm           |   35 +-
 third_party/dav1d/src/fg_apply.h                   |   25 +-
 third_party/dav1d/src/fg_apply_tmpl.c              |  140 +-
 third_party/dav1d/src/filmgrain.h                  |   86 +
 third_party/dav1d/src/filmgrain_tmpl.c             |  433 ++++
 third_party/dav1d/src/internal.h                   |   53 +-
 third_party/dav1d/src/lib.c                        |   92 +-
 third_party/dav1d/src/meson.build                  |   26 +-
 third_party/dav1d/src/obu.c                        |   44 +-
 third_party/dav1d/src/picture.c                    |    1 +
 third_party/dav1d/src/tables.c                     |    2 +-
 third_party/dav1d/src/thread_task.c                |  137 +-
 third_party/dav1d/src/thread_task.h                |    2 +
 third_party/dav1d/src/x86/cdef16_avx2.asm          |    8 -
 third_party/dav1d/src/x86/cdef16_sse.asm           |    8 -
 ...{film_grain16_avx2.asm => filmgrain16_avx2.asm} |   31 +-
 .../{film_grain16_sse.asm => filmgrain16_sse.asm}  |   31 +-
 .../{film_grain_avx2.asm => filmgrain_avx2.asm}    |   48 +-
 third_party/dav1d/src/x86/filmgrain_avx512.asm     | 1079 ++++++++++
 third_party/dav1d/src/x86/filmgrain_common.asm     |   46 +
 ...ilm_grain_init_tmpl.c => filmgrain_init_tmpl.c} |   53 +-
 .../x86/{film_grain_sse.asm => filmgrain_sse.asm}  |   31 +-
 third_party/dav1d/src/x86/ipred16_avx2.asm         |   10 +-
 third_party/dav1d/src/x86/ipred16_avx512.asm       |  833 ++++++++
 third_party/dav1d/src/x86/ipred16_sse.asm          |    8 -
 third_party/dav1d/src/x86/ipred_init_tmpl.c        |    2 +-
 third_party/dav1d/src/x86/itx16_avx2.asm           |    8 -
 third_party/dav1d/src/x86/itx16_sse.asm            |    8 -
 third_party/dav1d/src/x86/itx_avx2.asm             |    9 -
 third_party/dav1d/src/x86/itx_avx512.asm           |    9 -
 third_party/dav1d/src/x86/itx_init_tmpl.c          |   22 +-
 third_party/dav1d/src/x86/itx_sse.asm              |  340 ++--
 third_party/dav1d/src/x86/loopfilter16_avx2.asm    |    8 -
 third_party/dav1d/src/x86/loopfilter16_sse.asm     |    8 -
 .../dav1d/src/x86/looprestoration16_avx2.asm       |    8 -
 third_party/dav1d/src/x86/mc16_avx2.asm            |    8 -
 third_party/dav1d/src/x86/mc16_avx512.asm          |    8 -
 third_party/dav1d/src/x86/mc16_sse.asm             |   12 +-
 third_party/dav1d/src/x86/mc_avx512.asm            |    8 -
 third_party/dav1d/tests/checkasm/filmgrain.c       |   10 +-
 65 files changed, 11853 insertions(+), 1003 deletions(-)

diff --git a/media/libdav1d/README_MOZILLA b/media/libdav1d/README_MOZILLA
index eba613b2969da..bc5c2708696e1 100644
--- a/media/libdav1d/README_MOZILLA
+++ b/media/libdav1d/README_MOZILLA
@@ -39,7 +39,8 @@ The rough steps are:
   moz.build which has a condition on CONFIG['CPU_ARCH'].
 - Files ending in _tmpl.c may be automatically added to SOURCES, resulting in build failures.
   To fix this, the file must be moved to the appropriate bitdepth_basenames list where
-  generate_source.py will generate the templates into buildable source files.
+  generate_source.py will generate the templates into buildable source files. In general,
+  all *_tmpl.c files require BITDEPTH to be defined.
 - Clone the tag from the dav1d repo and build a stand-alone libdav1d following the steps here:
   https://code.videolan.org/videolan/dav1d#compile
 - Copy vcs_version.h from the local build/include/vcs_version.h
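
For context, each entry in a bitdepth_basenames list is expanded by generate_source.py
into per-bit-depth C wrappers; a minimal sketch of such a generated wrapper
(illustrative only; the real file names and include paths are chosen by
generate_source.py, not by this patch) looks like:

    /* generated 16-bit wrapper for filmgrain_tmpl.c (one such file per bit depth) */
    #define BITDEPTH 16
    #include "filmgrain_tmpl.c"

Building a *_tmpl.c file directly, without BITDEPTH defined, is what causes the
build failures mentioned above.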
diff --git a/media/libdav1d/asm/moz.build b/media/libdav1d/asm/moz.build
index 02647876d4904..7530c087ce30a 100644
--- a/media/libdav1d/asm/moz.build
+++ b/media/libdav1d/asm/moz.build
@@ -87,8 +87,14 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
             '../../../third_party/dav1d/src/x86/cdef16_avx2.asm', # moved from autovendored
             '../../../third_party/dav1d/src/x86/cdef_avx2.asm',
             '../../../third_party/dav1d/src/x86/cdef_avx512.asm',
-            '../../../third_party/dav1d/src/x86/film_grain16_avx2.asm',
-            '../../../third_party/dav1d/src/x86/film_grain_avx2.asm',
+            '../../../third_party/dav1d/src/x86/filmgrain16_avx2.asm',
+            '../../../third_party/dav1d/src/x86/filmgrain_avx2.asm',
+            '../../../third_party/dav1d/src/x86/filmgrain_avx512.asm',
+            # '../../../third_party/dav1d/src/x86/filmgrain_common.asm',
+            # ^ filmgrain_common.asm must *not* be in SOURCES because it's only
+            # used as an %include for other .asm files. Trying to assemble it
+            # will result in a nasm error: parser: instruction expected
+            '../../../third_party/dav1d/src/x86/ipred16_avx512.asm',
             '../../../third_party/dav1d/src/x86/ipred_avx2.asm',
             '../../../third_party/dav1d/src/x86/ipred_avx512.asm',
             '../../../third_party/dav1d/src/x86/itx16_avx2.asm',
@@ -111,8 +117,8 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
         '../../../third_party/dav1d/src/x86/cdef16_sse.asm',
         '../../../third_party/dav1d/src/x86/cdef_sse.asm',
         '../../../third_party/dav1d/src/x86/cpuid.asm',
-        '../../../third_party/dav1d/src/x86/film_grain16_sse.asm',
-        '../../../third_party/dav1d/src/x86/film_grain_sse.asm',
+        '../../../third_party/dav1d/src/x86/filmgrain16_sse.asm',
+        '../../../third_party/dav1d/src/x86/filmgrain_sse.asm',
         '../../../third_party/dav1d/src/x86/ipred16_avx2.asm',
         '../../../third_party/dav1d/src/x86/ipred16_sse.asm',
         '../../../third_party/dav1d/src/x86/ipred_sse.asm',
@@ -129,10 +135,12 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
     ]
 
     # BITDEPTH
+    # All the files here should be *_tmpl.c, and they should not appear in SOURCES,
+    # since they require BITDEPTH to be defined
     relative_path = '../../../third_party/dav1d/src/x86/'
     bitdepth_basenames = [
         'cdef_init_tmpl.c',
-        'film_grain_init_tmpl.c',
+        'filmgrain_init_tmpl.c',
         'ipred_init_tmpl.c',
         'itx_init_tmpl.c',
         'loopfilter_init_tmpl.c',
@@ -167,10 +175,12 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
     ]
 
     # BITDEPTH c file
+    # All the files here should be *_tmpl.c, and they should not appear in SOURCES,
+    # since they require BITDEPTH to be defined
     relative_path = '../../../third_party/dav1d/src/arm/'
     bitdepth_basenames = [
         'cdef_init_tmpl.c',
-        'film_grain_init_tmpl.c',
+        'filmgrain_init_tmpl.c',
         'ipred_init_tmpl.c',
         'itx_init_tmpl.c',
         'loopfilter_init_tmpl.c',
@@ -198,8 +208,8 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
             '../../../third_party/dav1d/src/arm/64/cdef.S',
             '../../../third_party/dav1d/src/arm/64/cdef16.S',
             '../../../third_party/dav1d/src/arm/64/cdef_tmpl.S',
-            '../../../third_party/dav1d/src/arm/64/film_grain.S',
-            '../../../third_party/dav1d/src/arm/64/film_grain16.S',
+            '../../../third_party/dav1d/src/arm/64/filmgrain.S',
+            '../../../third_party/dav1d/src/arm/64/filmgrain16.S',
             '../../../third_party/dav1d/src/arm/64/ipred.S',
             '../../../third_party/dav1d/src/arm/64/ipred16.S',
             # itx.S is used for both 8 and 16 bpc.
@@ -221,8 +231,8 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
             '../../../third_party/dav1d/src/arm/32/cdef.S',
             '../../../third_party/dav1d/src/arm/32/cdef16.S',
             '../../../third_party/dav1d/src/arm/32/cdef_tmpl.S',
-            '../../../third_party/dav1d/src/arm/32/film_grain.S',
-            '../../../third_party/dav1d/src/arm/32/film_grain16.S',
+            '../../../third_party/dav1d/src/arm/32/filmgrain.S',
+            '../../../third_party/dav1d/src/arm/32/filmgrain16.S',
             '../../../third_party/dav1d/src/arm/32/ipred.S',
             '../../../third_party/dav1d/src/arm/32/ipred16.S',
             '../../../third_party/dav1d/src/arm/32/itx.S',
diff --git a/media/libdav1d/moz.build b/media/libdav1d/moz.build
index c84cef8802b8e..923dca05ebead 100644
--- a/media/libdav1d/moz.build
+++ b/media/libdav1d/moz.build
@@ -103,7 +103,7 @@ EXPORTS.dav1d.src += [
     '../../third_party/dav1d/src/data.h',
     '../../third_party/dav1d/src/decode.h',
     '../../third_party/dav1d/src/dequant_tables.h',
-    '../../third_party/dav1d/src/film_grain.h',
+    '../../third_party/dav1d/src/filmgrain.h',
     '../../third_party/dav1d/src/getbits.h',
     '../../third_party/dav1d/src/intra_edge.h',
     '../../third_party/dav1d/src/lf_mask.h',
@@ -123,12 +123,14 @@ EXPORTS.dav1d.src += [
 ]
 
 # common BITDEPTH 8, 16
+# All the files here should be *_tmpl.c, and they should not appear in SOURCES,
+# since they require BITDEPTH to be defined
 relative_path = '../../third_party/dav1d/src/'
 bitdepth_basenames = [
     'cdef_apply_tmpl.c',
     'cdef_tmpl.c',
     'fg_apply_tmpl.c',
-    'film_grain_tmpl.c',
+    'filmgrain_tmpl.c',
     'ipred_prepare_tmpl.c',
     'ipred_tmpl.c',
     'itx_tmpl.c',
diff --git a/media/libdav1d/moz.yaml b/media/libdav1d/moz.yaml
index 2de4cdc699621..623ce9625d4de 100644
--- a/media/libdav1d/moz.yaml
+++ b/media/libdav1d/moz.yaml
@@ -20,11 +20,11 @@ origin:
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: commit 1f09a9119fb794ab41b1e527d848c2a210ca43d4 (2022-02-04T23:02:17.000-03:00).
+  release: commit 28a9c46e1c36540d3276299f2e284ece1d2386be (2022-03-10T14:49:23.000+00:00).
 
   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: 1f09a9119fb794ab41b1e527d848c2a210ca43d4
+  revision: 28a9c46e1c36540d3276299f2e284ece1d2386be
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
diff --git a/media/libdav1d/vcs_version.h b/media/libdav1d/vcs_version.h
index 980b8d307dcaf..a449a2c0360d7 100644
--- a/media/libdav1d/vcs_version.h
+++ b/media/libdav1d/vcs_version.h
@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "1f09a9119fb794ab41b1e527d848c2a210ca43d4"
+#define DAV1D_VERSION "0.9.2-170-g28a9c46"
diff --git a/media/libdav1d/version.h b/media/libdav1d/version.h
index a797de5df28c5..5619d66bb76a1 100644
--- a/media/libdav1d/version.h
+++ b/media/libdav1d/version.h
@@ -27,8 +27,8 @@
 #ifndef DAV1D_VERSION_H
 #define DAV1D_VERSION_H
 
-#define DAV1D_API_VERSION_MAJOR 9
-#define DAV1D_API_VERSION_MINOR 2
+#define DAV1D_API_VERSION_MAJOR 6
+#define DAV1D_API_VERSION_MINOR 6
 #define DAV1D_API_VERSION_PATCH 0
 
 #endif /* DAV1D_VERSION_H */
diff --git a/third_party/dav1d/NEWS b/third_party/dav1d/NEWS
index 2a4c4d00ca637..6158d780487bd 100644
--- a/third_party/dav1d/NEWS
+++ b/third_party/dav1d/NEWS
@@ -1,3 +1,32 @@
+Changes for 1.0.0 'Peregrine falcon':
+-------------------------------------
+
+1.0.0 is a major release of dav1d, adding important features and bug fixes.
+
+It notably makes an important change to the way threading works, by adding
+automatic thread management.
+
+It also adds support for AVX-512 acceleration, and adds speedups to existing x86
+code (from SSE2 to AVX2).
+
+1.0.0 adds a new grain API to ease acceleration on the GPU.
+
+Finally, 1.0.0 fixes numerous small bugs reported since the beginning of the
+project, in order to make this a proper release.
+
+                                     .''.
+         .''.      .        *''*    :_\/_:     .
+        :_\/_:   _\(/_  .:.*_\/_*   : /\ :  .'.:.'.
+    .''.: /\ :   ./)\   ':'* /\ * :  '..'.  -=:o:=-
+   :_\/_:'.:::.    ' *''*    * '.\'/.' _\(/_'.':'.'
+   : /\ : :::::     *_\/_*     -= o =-  /)\    '  *
+    '..'  ':::'     * /\ *     .'/.\'.   '
+        *            *..*         :
+          *                       :
+          *         1.0.0
+
+
+
 Changes for 0.9.2 'Golden Eagle':
 ---------------------------------
 
diff --git a/third_party/dav1d/README.md b/third_party/dav1d/README.md
index 34a351e247f21..a8a35e1c6fe2d 100644
--- a/third_party/dav1d/README.md
+++ b/third_party/dav1d/README.md
@@ -2,11 +2,13 @@
 
 # dav1d
 
-**dav1d** is a new **AV1** cross-platform **d**ecoder, open-source, and focused on speed and correctness.
+**dav1d** is an **AV1** cross-platform **d**ecoder, open-source, and focused on speed and correctness.
+
+It is now battle-tested and production-ready and can be used everywhere.
 
 The canonical repository URL for this repo is https://code.videolan.org/videolan/dav1d
 
-This project is partially funded by the *Alliance for Open Media*/**AOM**.
+This project was partially funded by the *Alliance for Open Media*/**AOM**.
 
 ## Goal and Features
 
@@ -38,11 +40,11 @@ The plan is the following:
 9. Make high bit-depth fast on older mobile, by writing asm for ARMv7 chips,
 10. Make high bit-depth fast on desktop, by writing asm for AVX2 chips,
 11. Make high bit-depth fast on older desktop, by writing asm for SSSE3+ chips,
+12. Improve threading.
 
 ### On-going
-12. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
-13. Accelerate for less common architectures, like PPC, SSE2 or AVX-512.
-14. Improve threading.
+13. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
+14. Accelerate for less common architectures, like PPC, SSE2, RISC-V or AVX-512.
 
 ### After
 15. Use more GPU decoding, when possible.
diff --git a/third_party/dav1d/THANKS.md b/third_party/dav1d/THANKS.md
index 06d879853c3e6..2791e3dcc61bc 100644
--- a/third_party/dav1d/THANKS.md
+++ b/third_party/dav1d/THANKS.md
@@ -16,16 +16,18 @@ The Alliance for Open Media (AOM) for funding this project.
 
 And all the dav1d Authors (git shortlog -sn), including:
 
-Martin Storsjö, Janne Grunau, Henrik Gramner, Ronald S. Bultje, James Almer,
-Marvin Scholz, Luc Trudeau, Victorien Le Couviour--Tuffet, Jean-Baptiste Kempf,
-Hugo Beauzée-Luyssen, Matthias Dressel, Konstantin Pavlov, David Michael Barr,
-Steve Lhomme, Niklas Haas, B Krishnan Iyer, Francois Cartegnie, Liwei Wang,
-Nathan E. Egge, Derek Buitenhuis, Michael Bradshaw, Raphaël Zumer,
-Xuefeng Jiang, Luca Barbato, Jan Beich, Wan-Teh Chang, Justin Bull, Boyuan Xiao,
-Dale Curtis, Kyle Siefring, Raphael Zumer, Rupert Swarbrick, Thierry Foucu,
-Thomas Daede, Colin Lee, Emmanuel Gil Peyrot, Lynne, Michail Alvanos,
-Nico Weber, SmilingWolf, Tristan Laurent, Vittorio Giovara, Anisse Astier,
-Dmitriy Sychov, Ewout ter Hoeven, Fred Barbier, Jean-Yves Avenard,
-Mark Shuttleworth, Matthieu Bouron, Nicolas Frattaroli, Pablo Stebler,
-Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Sylvestre Ledru, Timo Gurr,
-Tristan Matthews, Xavier Claessens, Xu Guangxin, kossh1 and skal.
+Martin Storsjö, Henrik Gramner, Ronald S. Bultje, Janne Grunau, James Almer,
+Victorien Le Couviour--Tuffet, Matthias Dressel, Marvin Scholz, Luc Trudeau,
+Jean-Baptiste Kempf, Hugo Beauzée-Luyssen, Niklas Haas, Konstantin Pavlov,
+David Michael Barr, Steve Lhomme, Nathan E. Egge, Kyle Siefring, Raphaël Zumer,
+B Krishnan Iyer, Francois Cartegnie, Liwei Wang, Derek Buitenhuis,
+Michael Bradshaw, Wan-Teh Chang, Xuefeng Jiang, Luca Barbato, Jan Beich,
+Christophe Gisquet, Justin Bull, Boyuan Xiao, Dale Curtis, Emmanuel Gil Peyrot,
+Rupert Swarbrick, Thierry Foucu, Thomas Daede, Colin Lee, Jonathan Wright,
+Lynne, Michail Alvanos, Nico Weber, Salome Thirot, SmilingWolf, Tristan Laurent,
+Vittorio Giovara, Yannis Guyon, André Kempe, Anisse Astier, Anton Mitrofanov,
+Dmitriy Sychov, Ewout ter Hoeven, Fred Barbier, Jean-Yves Avenard, Joe Drago,
+Mark Shuttleworth, Matthieu Bouron, Mehdi Sabwat, Nicolas Frattaroli,
+Pablo Stebler, Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Sylvain BERTRAND,
+Sylvestre Ledru, Timo Gurr, Tristan Matthews, Vibhoothi, Xavier Claessens,
+Xu Guangxin, kossh1 and skal
diff --git a/third_party/dav1d/include/dav1d/common.h b/third_party/dav1d/include/dav1d/common.h
index b55e9399f06f6..8685b4f078285 100644
--- a/third_party/dav1d/include/dav1d/common.h
+++ b/third_party/dav1d/include/dav1d/common.h
@@ -78,4 +78,9 @@ typedef struct Dav1dDataProps {
     struct Dav1dUserData user_data; ///< user-configurable data, default NULL members
 } Dav1dDataProps;
 
+/**
+ * Release reference to a Dav1dDataProps.
+ */
+DAV1D_API void dav1d_data_props_unref(Dav1dDataProps *props);
+
 #endif /* DAV1D_COMMON_H */
diff --git a/third_party/dav1d/include/dav1d/dav1d.h b/third_party/dav1d/include/dav1d/dav1d.h
index 0938156f759d5..fd3b622505b7e 100644
--- a/third_party/dav1d/include/dav1d/dav1d.h
+++ b/third_party/dav1d/include/dav1d/dav1d.h
@@ -274,6 +274,19 @@ enum Dav1dEventFlags {
  */
 DAV1D_API int dav1d_get_event_flags(Dav1dContext *c, enum Dav1dEventFlags *flags);
 
+/**
+ * Retrieve the user-provided metadata associated with the input data packet
+ * for the last decoding error reported to the user, i.e. a negative return
+ * value (not EAGAIN) from dav1d_send_data() or dav1d_get_picture().
+ *
+ * @param   c Input decoder instance.
+ * @param out Output Dav1dDataProps. On success, the caller assumes ownership of
+ *            the returned reference.
+ *
+ * @return 0 on success, or < 0 (a negative DAV1D_ERR code) on error.
+ */
+DAV1D_API int dav1d_get_decode_error_data_props(Dav1dContext *c, Dav1dDataProps *out);
+
 # ifdef __cplusplus
 }
 # endif
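
Taken together, the two additions above (dav1d_data_props_unref() in common.h and
dav1d_get_decode_error_data_props() here) are intended to be used roughly as
follows. This is only a sketch based on the doc comments, not code from this
patch; it assumes ctx is an already-opened Dav1dContext and ret is the return
value of dav1d_send_data() or dav1d_get_picture():

    #include <errno.h>
    #include <dav1d/dav1d.h>

    static void log_decode_error(Dav1dContext *ctx, int ret) {
        if (ret >= 0 || ret == DAV1D_ERR(EAGAIN))
            return;  /* only real decode errors carry error data props */
        Dav1dDataProps props;
        if (dav1d_get_decode_error_data_props(ctx, &props) == 0) {
            /* props (timestamp, user_data, ...) identifies the input packet
             * that triggered the error; the caller owns this reference. */
            dav1d_data_props_unref(&props);
        }
    }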
diff --git a/third_party/dav1d/meson.build b/third_party/dav1d/meson.build
index 0e63d3bced698..5efb88c5d3fb4 100644
--- a/third_party/dav1d/meson.build
+++ b/third_party/dav1d/meson.build
@@ -30,7 +30,7 @@ project('dav1d', ['c'],
                       'b_ndebug=if-release'],
     meson_version: '>= 0.49.0')
 
-dav1d_soname_version       = '6.4.0'
+dav1d_soname_version       = '6.6.0'
 dav1d_api_version_array    = dav1d_soname_version.split('.')
 dav1d_api_version_major    = dav1d_api_version_array[0]
 dav1d_api_version_minor    = dav1d_api_version_array[1]
@@ -413,11 +413,7 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
 
     # check NASM version
     if nasm.found()
-        nasm_r = run_command(nasm, '-v')
-
-        if nasm_r.returncode() != 0
-            error('failed running nasm to obtain its version')
-        endif
+        nasm_r = run_command(nasm, '-v', check: true)
 
         out = nasm_r.stdout().strip().split()
         if out[1].to_lower() == 'version'
diff --git a/third_party/dav1d/src/arm/32/filmgrain.S b/third_party/dav1d/src/arm/32/filmgrain.S
new file mode 100644
index 0000000000000..d1f83efb98e01
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/filmgrain.S
@@ -0,0 +1,2039 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+.macro increment_seed steps, shift=1
+        lsr             r11, r2,  #3
+        lsr             r12, r2,  #12
+        lsr             lr,  r2,  #1
+        eor             r11, r2,  r11                     // (r >> 0) ^ (r >> 3)
+        eor             r12, r12, lr                      // (r >> 12) ^ (r >> 1)
+        eor             r11, r11, r12                     // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+        lsr             r2,  r2,  #\steps
+.endif
+        and             r11, r11, #((1 << \steps) - 1)    // bit
+.if \shift
+        orr             r2,  r2,  r11, lsl #(16 - \steps) // *state
+.else
+        orr             r2,  r2,  r11, lsl #16            // *state
+.endif
+.endm
+
+.macro read_rand dest, bits, age
+        ubfx            \dest,  r2,   #16 - \bits - \age, #\bits
+.endm
+
+.macro read_shift_rand dest, bits
+        ubfx            \dest,  r2,   #17 - \bits, #\bits
+        lsr             r2,  r2,  #1
+.endm
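
The increment_seed/read_rand macros above implement the film grain pseudo-random
number generator described by the inline comments; a scalar C sketch of one step
(illustrative names, not dav1d's) would be:

    static unsigned get_random_number(unsigned *state, int bits) {
        unsigned r   = *state;
        unsigned bit = ((r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)) & 1;
        r = (r >> 1) | (bit << 15);                     /* shift the new bit in */
        *state = r;
        return (r >> (16 - bits)) & ((1u << bits) - 1); /* top "bits" bits */
    }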
+
+// special calling convention:
+// r2 holds seed
+// r3 holds dav1d_gaussian_sequence
+// clobbers r11-r12
+// returns in d0-d1
+function get_gaussian_neon
+        push            {r5-r6,lr}
+        increment_seed  4
+        read_rand       r5,  11,  3
+        read_rand       r6,  11,  2
+        add             r5,  r3,  r5,  lsl #1
+        add             r6,  r3,  r6,  lsl #1
+        vld1.16         {d0[0]}, [r5]
+        read_rand       r5,  11,  1
+        vld1.16         {d0[1]}, [r6]
+        add             r5,  r3,  r5,  lsl #1
+        read_rand       r6, 11,  0
+        increment_seed  4
+        add             r6,  r3,  r6,  lsl #1
+        vld1.16         {d0[2]}, [r5]
+        read_rand       r5,  11,  3
+        vld1.16         {d0[3]}, [r6]
+        add             r5,  r3,  r5,  lsl #1
+        read_rand       r6,  11,  2
+        vld1.16         {d1[0]}, [r5]
+        add             r6,  r3,  r6,  lsl #1
+        read_rand       r5,  11,  1
+        vld1.16         {d1[1]}, [r6]
+        read_rand       r6,  11,  0
+        add             r5,  r3,  r5,  lsl #1
+        add             r6,  r3,  r6,  lsl #1
+        vld1.16         {d1[2]}, [r5]
+        vld1.16         {d1[3]}, [r6]
+        pop             {r5-r6,pc}
+endfunc
+
+.macro get_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+        vmovn.i16       \r0, q0
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+        vmovn.i16       \r1, q0
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+        vmovn.i16       \r2, q0
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+        vmovn.i16       \r3, q0
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+        vmovn.i16       \r4, q0
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+        vmovn.i16       \r5, q0
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+        vmovn.i16       \r6, q0
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+        vmovn.i16       \r7, q0
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+        vmovn.i16       \r8, q0
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+        vmovn.i16       \r9, q0
+        increment_seed  2
+        read_rand       r11, 11,  1
+        read_rand       r12, 11,  0
+        add             r11, r3,  r11, lsl #1
+        add             r12, r3,  r12, lsl #1
+        vld1.16         {d0[0]}, [r11]
+        vld1.16         {d0[1]}, [r12]
+        vrshl.s16       d0,  d0,  d30
+        vmovn.i16       \r10, q0
+.endm
+
+.macro store_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
+        vst1.16         {\r0, \r1, \r2, \r3}, [r0]!
+        vst1.16         {\r4, \r5, \r6, \r7}, [r0]!
+        vst1.16         {\r8, \r9},           [r0]!
+        vst1.16         {\r10[0]},            [r0]!
+.endm
+
+.macro get_grain_row_44 r0, r1, r2, r3, r4, r5
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+        vmovn.i16       \r0, q0
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+        vmovn.i16       \r1, q0
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+        vmovn.i16       \r2, q0
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+        vmovn.i16       \r3, q0
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+        vmovn.i16       \r4, q0
+        increment_seed  4
+        read_rand       r11, 11,  3
+        read_rand       r12, 11,  2
+        add             r11, r3,  r11, lsl #1
+        add             r12, r3,  r12, lsl #1
+        vld1.16         {d0[]}, [r11]
+        read_rand       r11, 11,  1
+        vld1.16         {d0[1]}, [r12]
+        add             r11, r3,  r11, lsl #1
+        read_rand       r12, 11,  0
+        vld1.16         {d0[2]}, [r11]
+        add             r12, r3,  r12, lsl #1
+        vld1.16         {d0[3]}, [r12]
+        vrshl.s16       d0,  d0,  d30
+        vmovn.i16       \r5, q0
+.endm
+
+.macro store_grain_row_44 r0, r1, r2, r3, r4, r5
+        vst1.16         {\r0, \r1, \r2, \r3}, [r0]!
+        vst1.16         {\r4, \r5},           [r0]
+        add             r0,  r0,  #GRAIN_WIDTH-32
+.endm
+
+function get_grain_2_neon
+        push            {r11,lr}
+        increment_seed  2
+        read_rand       r11, 11,  1
+        read_rand       r12, 11,  0
+        add             r11, r3,  r11, lsl #1
+        add             r12, r3,  r12, lsl #1
+        vld1.16         {d0[0]}, [r11]
+        vld1.16         {d0[1]}, [r12]
+        vrshl.s16       d0,  d0,  d30
+        vmovn.i16       d0,  q0
+        pop             {r11,pc}
+endfunc
+
+.macro get_grain_2 dst
+        bl              get_grain_2_neon
+.ifnc \dst, d0
+        vmov            \dst, d0
+.endif
+.endm
+
+// r1 holds the number of entries to produce
+// r6, r8 and r10 hold the previous output entries
+// q0 holds the vector of produced entries
+// q1 holds the input vector of sums from above
+.macro output_lag n
+function output_lag\n\()_neon
+        push            {r0, lr}
+.if \n == 1
+        mov             lr,  #-128
+.else
+        mov             r0,  #1
+        mov             lr,  #1
+        sub             r7,  r7,  #1
+        sub             r9,  r9,  #1
+        lsl             r0,  r0,  r7
+        lsl             lr,  lr,  r9
+        add             r7,  r7,  #1
+        add             r9,  r9,  #1
+.endif
+1:
+        read_shift_rand r12, 11
+        vmov.32         r11, d2[0]
+        lsl             r12, r12, #1
+        vext.8          q0,  q0,  q0,  #1
+        ldrsh           r12, [r3, r12]
+.if \n == 1
+        mla             r11, r6,  r4,  r11        // sum (above) + *coeff * prev output
+        add             r6,  r11, r8              // 1 << (ar_coeff_shift - 1)
+        add             r12, r12, r10
+        asr             r6,  r6,  r7              // >> ar_coeff_shift
+        asr             r12, r12, r9              // >> (4 + grain_scale_shift)
+        add             r6,  r6,  r12
+        cmp             r6,  r5
+.elseif \n == 2
+        mla             r11, r8,  r4,  r11        // sum (above) + *coeff * prev output 1
+        mla             r11, r6,  r10, r11        // += *coeff * prev output 2
+        mov             r8,  r6
+        add             r6,  r11, r0              // 1 << (ar_coeff_shift - 1)
+        add             r12, r12, lr              // 1 << (4 + grain_scale_shift - 1)
+        asr             r6,  r6,  r7              // >> ar_coeff_shift
+        asr             r12, r12, r9              // >> (4 + grain_scale_shift)
+        add             r6,  r6,  r12
+        push            {lr}
+        cmp             r6,  r5
+        mov             lr,  #-128
+.else
+        push            {r1-r3}
+        sbfx            r1,  r4,  #0,  #8
+        sbfx            r2,  r4,  #8,  #8
+        sbfx            r3,  r4,  #16, #8
+        mla             r11, r10, r1,  r11        // sum (above) + *coeff * prev output 1
+        mla             r11, r8,  r2,  r11        // sum (above) + *coeff * prev output 2
+        mla             r11, r6,  r3,  r11        // += *coeff * prev output 3
+        pop             {r1-r3}
+        mov             r10, r8
+        mov             r8,  r6
+
+        add             r6,  r11, r0              // 1 << (ar_coeff_shift - 1)
+        add             r12, r12, lr              // 1 << (4 + grain_scale_shift - 1)
+        asr             r6,  r6,  r7              // >> ar_coeff_shift
+        asr             r12, r12, r9              // >> (4 + grain_scale_shift)
+        add             r6,  r6,  r12
+        push            {lr}
+        cmp             r6,  r5
+        mov             lr,  #-128
+.endif
+        it              gt
+        movgt           r6,  r5
+        cmp             r6,  lr
+        it              lt
+        movlt           r6,  lr
+.if \n >= 2
+        pop             {lr}
+.endif
+        subs            r1,  r1,  #1
+        vext.8          q1,  q1,  q1,  #4
+        vmov.8          d1[7], r6
+        bgt             1b
+        pop             {r0, pc}
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
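
In scalar terms, the lag-1 case of the output_lag macros above computes the
following per sample (a sketch with illustrative names, based on the inline
comments; 127 and -128 are the 8 bpc grain clamp bounds held in r5 and lr):

    static int output_lag1_scalar(int sum_above, int coeff, int prev,
                                  int gauss, int ar_coeff_shift,
                                  int grain_scale_shift) {
        int sum   = sum_above + coeff * prev;                /* AR term */
        sum       = (sum + (1 << (ar_coeff_shift - 1))) >> ar_coeff_shift;
        int grain = (gauss + (1 << (4 + grain_scale_shift - 1)))
                          >> (4 + grain_scale_shift);        /* scaled noise */
        int out   = sum + grain;
        if (out >  127) out =  127;
        if (out < -128) out = -128;
        return out;
    }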
+
+
+function sum_lag1_above_neon
+        vmull.s8        q2,  d6,  d28
+        vmull.s8        q3,  d7,  d28
+        vmull.s8        q4,  d0,  d27
+        vmull.s8        q5,  d1,  d27
+
+        vaddl.s16       q0,  d4,  d8
+        vaddl.s16       q2,  d5,  d9
+        vaddl.s16       q4,  d6,  d10
+        vaddl.s16       q5,  d7,  d11
+
+        vmull.s8        q3,  d3,  d29
+        vmull.s8        q1,  d2,  d29
+
+        vaddw.s16       q4,  q4,  d6
+        vaddw.s16       q5,  q5,  d7
+        vaddw.s16       q3,  q2,  d3
+        vaddw.s16       q2,  q0,  d2
+        bx              lr
+endfunc
+
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
+.ifc \lag\()_\edge, lag3_left
+        bl              sum_lag3_left_above_neon
+.else
+        bl              sum_\lag\()_above_neon
+.endif
+.ifc \type, uv_420
+        vpush           {q6-q7}
+        add             r12, r11, #GRAIN_WIDTH
+        vld1.16         {q0, q1}, [r11]!
+        vld1.16         {q6, q7}, [r12]!
+        vpaddl.s8       q0,  q0
+        vpaddl.s8       q1,  q1
+        vpaddl.s8       q6,  q6
+        vpaddl.s8       q7,  q7
+        vadd.i16        q0,  q0,  q6
+        vadd.i16        q1,  q1,  q7
+        vpop            {q6-q7}
+        vrshrn.s16      d0,  q0,  #2
+        vrshrn.s16      d1,  q1,  #2
+.endif
+.ifc \type, uv_422
+        vld1.8          {q0, q1}, [r11]!
+        vpaddl.s8       q0,  q0
+        vpaddl.s8       q1,  q1
+        vrshrn.s16      d0,  q0,  #1
+        vrshrn.s16      d1,  q1,  #1
+.endif
+.ifc \type, uv_444
+        vld1.8          {q0}, [r11]!
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+        vdup.8          d13, \uv_coeff
+.endif
+        vmull.s8        q1,  d0,  d13
+        vmull.s8        q0,  d1,  d13
+        vaddw.s16       q2,  q2,  d2
+        vaddw.s16       q3,  q3,  d3
+        vaddw.s16       q4,  q4,  d0
+        vaddw.s16       q5,  q5,  d1
+.endif
+.if \uv_layout && \elems == 16
+        b               sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 15
+        b               sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 9
+        b               sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+        push            {r11}
+.ifc \edge, left
+        increment_seed  4
+        read_rand       r11, 11,  3
+        read_rand       r12, 11,  2
+        add             r11, r3,  r11, lsl #1
+        add             r12, r3,  r12, lsl #1
+        vld1.16         {d1[1]}, [r11]
+        read_rand       r11, 11,  1
+        vld1.16         {d1[2]}, [r12]
+        add             r11, r3,  r11, lsl #1
+        vld1.16         {d1[3]}, [r11]
+        lsl             r2,  r2,  #1             // shift back the state as if we'd done increment_seed with shift=0
+        vrshl.s16       d1,  d1,  d30
+        vmovn.i16       d1,  q0
+        vext.8          q2,  q2,  q2,  #12
+.ifc \lag, lag3
+        vmov.s8         r10, d1[5]
+.endif
+.ifnc \lag, lag1
+        vmov.s8         r8,  d1[6]
+.endif
+        vmov.s8         r6,  d1[7]
+
+        vmov            q1,  q2
+        mov             r1,  #1
+        bl              output_\lag\()_neon
+.else
+        increment_seed  4, shift=0
+        vmov            q1,  q2
+        mov             r1,  #4
+        bl              output_\lag\()_neon
+.endif
+
+        increment_seed  4, shift=0
+        vmov            q1,  q3
+        mov             r1,  #4
+        bl              output_\lag\()_neon
+
+        increment_seed  4, shift=0
+        vmov            q1,  q4
+.if \elems == 9
+        mov             r1,  #1
+        bl              output_\lag\()_neon
+        lsr             r2,  r2,  #3
+
+        read_rand       r11, 11,  2
+        read_rand       r12, 11,  1
+        add             r11, r3,  r11, lsl #1
+        add             r12, r3,  r12, lsl #1
+        vld1.16         {d2[0]}, [r11]
+        read_rand       r11, 11,  0
+        vld1.16         {d2[1]}, [r12]
+        add             r11, r3,  r11, lsl #1
+        vld1.16         {d2[2]}, [r11]
+        vrshl.s16       d2,  d2,  d30
+        vmovn.i16       d2,  q1
+        vext.8          q0,  q0,  q1,  #7
+.else
+        mov             r1,  #4
+        bl              output_\lag\()_neon
+
+        increment_seed  4, shift=0
+        vmov            q1,  q5
+
+.ifc \edge, right
+        mov             r1,  #3
+        bl              output_\lag\()_neon
+        read_shift_rand r11, 11
+        add             r11, r3,  r11, lsl #1
+        vld1.16         {d2[0]}, [r11]
+        vrshl.s16       d2,  d2,  d30
+        vext.8          q0,  q0,  q1,  #1
+.else
+        mov             r1,  #4
+        bl              output_\lag\()_neon
+.endif
+.endif
+.if \store
+        vst1.8          {q0}, [r0]!
+.endif
+        pop             {r11}
+        pop             {r1, pc}
+.endif
+.endm
+
+.macro sum_lag1_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag1_\edge\()_neon
+        push            {r1, lr}
+        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems, store=0
+endfunc
+.endm
+
+sum_lag1_func y,      0,   left
+sum_lag1_func y,      0,   mid
+sum_lag1_func y,      0,   right, 15
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 15
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 9
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 9
+
+.macro sum_lag1 type, dst, left, mid, right, edge=mid
+        vmov            q3,  \mid
+        vext.8          q0,  \left, \mid,   #15
+        vext.8          q1,  \mid,  \right, #1
+        bl              sum_\type\()_lag1_\edge\()_neon
+        vmov            \dst, q0
+.endm
+
+.macro sum_y_lag1 dst, left, mid, right, edge=mid
+        sum_lag1        y, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid
+        sum_lag1        uv_444, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid
+        sum_lag1        uv_422, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid
+        sum_lag1        uv_420, \dst, \left, \mid, \right, \edge
+.endm
+
+
+function sum_lag2_above_neon
+        push            {lr}
+        sub             r12, r0,  #2*GRAIN_WIDTH - 16
+        sub             lr,  r0,  #1*GRAIN_WIDTH - 16
+        vld1.8          {q10}, [r12] // load top right
+        vld1.8          {q13}, [lr]
+
+        vext.8          q6,  q8,  q9,  #14 // top left, top mid
+        vdup.8          d14, d28[0]
+        vext.8          q8,  q8,  q9,  #15
+        vdup.8          d15, d28[1]
+
+        vmull.s8        q0,  d12, d14
+        vmull.s8        q1,  d13, d14
+        vmull.s8        q6,  d16, d15
+        vmull.s8        q8,  d17, d15
+
+        vaddl.s16       q2,  d0,  d12
+        vaddl.s16       q3,  d1,  d13
+        vaddl.s16       q4,  d2,  d16
+        vaddl.s16       q5,  d3,  d17
+
+        vext.8          q6,  q9,  q10, #1  // top mid, top right
+        vdup.8          d14, d28[3]
+        vext.8          q8,  q9,  q10, #2
+        vdup.8          d15, d28[4]
+
+        vmull.s8        q0,  d12, d14
+        vmull.s8        q1,  d13, d14
+        vmull.s8        q6,  d16, d15
+        vmull.s8        q8,  d17, d15
+
+        vaddl.s16       q7,  d0,  d12
+        vaddl.s16       q0,  d1,  d13
+        vaddl.s16       q6,  d2,  d16
+        vaddl.s16       q1,  d3,  d17
+
+        vadd.i32        q2,  q2,  q7
+        vadd.i32        q3,  q3,  q0
+        vadd.i32        q4,  q4,  q6
+        vadd.i32        q5,  q5,  q1
+
+        vext.8          q6,  q11, q12, #14 // top left, top mid
+        vdup.8          d14, d28[5]
+        vext.8          q8,  q11, q12, #15
+        vdup.8          d15, d28[6]
+
+        vmull.s8        q0,  d12, d14
+        vmull.s8        q1,  d13, d14
+        vmull.s8        q6,  d16, d15
+        vmull.s8        q8,  d17, d15
+
+        vaddl.s16       q7,  d0,  d12
+        vaddl.s16       q0,  d1,  d13
+        vaddl.s16       q6,  d2,  d16
+        vaddl.s16       q1,  d3,  d17
+
+        vadd.i32        q2,  q2,  q7
+        vadd.i32        q3,  q3,  q0
+        vadd.i32        q4,  q4,  q6
+        vadd.i32        q5,  q5,  q1
+
+        vext.8          q6,  q12, q13, #1  // top mid, top right
+        vdup.8          d14, d29[0]
+        vext.8          q8,  q12, q13, #2
+        vdup.8          d15, d29[1]
+
+        vmull.s8        q0,  d12, d14
+        vmull.s8        q1,  d13, d14
+        vmull.s8        q6,  d16, d15
+        vmull.s8        q8,  d17, d15
+
+        vaddl.s16       q7,  d0,  d12
+        vaddl.s16       q0,  d1,  d13
+        vaddl.s16       q6,  d2,  d16
+        vaddl.s16       q1,  d3,  d17
+
+        vadd.i32        q2,  q2,  q7
+        vadd.i32        q3,  q3,  q0
+        vadd.i32        q4,  q4,  q6
+        vadd.i32        q5,  q5,  q1
+
+        vdup.8          d14, d28[2]
+        vdup.8          d15, d28[7]
+
+        vmull.s8        q0,  d18, d14
+        vmull.s8        q1,  d19, d14
+        vmull.s8        q6,  d24, d15
+        vmull.s8        q8,  d25, d15
+
+        vaddl.s16       q7,  d0,  d12
+        vaddl.s16       q0,  d1,  d13
+        vaddl.s16       q6,  d2,  d16
+        vaddl.s16       q1,  d3,  d17
+
+        vmov            q8,  q9
+        vmov            q9,  q10
+
+        vadd.i32        q2,  q2,  q7
+        vadd.i32        q3,  q3,  q0
+        vadd.i32        q4,  q4,  q6
+        vadd.i32        q5,  q5,  q1
+
+        vmov            q11, q12
+        vmov            q12, q13
+
+        pop             {pc}
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag2_\edge\()_neon
+        push            {r1, lr}
+.ifc \edge, left
+        sub             r12, r0,  #2*GRAIN_WIDTH
+        sub             lr,  r0,  #1*GRAIN_WIDTH
+        vld1.8          {q9},  [r12] // load the previous block right above
+        vld1.8          {q12}, [lr]
+.endif
+        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[4]
+endfunc
+.endm
+
+sum_lag2_func y,      0,   left
+sum_lag2_func y,      0,   mid
+sum_lag2_func y,      0,   right, 15
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 15
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 9
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 9
+
+
+function sum_lag3_left_above_neon
+        // A separate codepath for the left edge, to avoid reading outside
+        // of the edge of the buffer.
+        sub             r12, r0,  #3*GRAIN_WIDTH
+        vld1.8          {q11, q12}, [r12]
+        vext.8          q12, q11, q12, #13
+        vext.8          q11, q11, q11, #13
+        b               sum_lag3_above_start
+endfunc
+
+function sum_lag3_above_neon
+        sub             r12, r0,  #3*GRAIN_WIDTH + 3
+        vld1.8          {q11, q12}, [r12]
+
+sum_lag3_above_start:
+        vdup.8          d20, d26[0]
+        vext.8          q9,  q11, q12, #1
+        vdup.8          d21, d26[1]
+
+        vmull.s8        q0,  d22, d20
+        vmull.s8        q1,  d23, d20
+        vmull.s8        q6,  d18, d21
+        vmull.s8        q7,  d19, d21
+
+        vext.8          q8,  q11, q12, #2
+        vdup.8          d20, d26[2]
+        vext.8          q9,  q11, q12, #3
+        vdup.8          d21, d26[3]
+
+        vaddl.s16       q2,  d0,  d12
+        vaddl.s16       q3,  d1,  d13
+        vaddl.s16       q4,  d2,  d14
+        vaddl.s16       q5,  d3,  d15
+
+        vmull.s8        q0,  d16, d20
+        vmull.s8        q1,  d17, d20
+        vmull.s8        q6,  d18, d21
+        vmull.s8        q7,  d19, d21
+
+        vaddl.s16       q8,  d0,  d12
+        vaddl.s16       q9,  d1,  d13
+        vaddl.s16       q0,  d2,  d14
+        vaddl.s16       q1,  d3,  d15
+
+        vext.8          q6,  q11, q12, #4
+        vdup.8          d20, d26[4]
+        vext.8          q7,  q11, q12, #5
+        vdup.8          d21, d26[5]
+
+        vadd.i32        q2,  q2,  q8
+        vadd.i32        q3,  q3,  q9
+        vadd.i32        q4,  q4,  q0
+        vadd.i32        q5,  q5,  q1
+
+        vmull.s8        q0,  d12, d20
+        vmull.s8        q1,  d13, d20
+        vmull.s8        q8,  d14, d21
+        vmull.s8        q9,  d15, d21
+
+        sub             r12, r0,  #2*GRAIN_WIDTH + 3
+
+        vaddl.s16       q6,  d0,  d16
+        vaddl.s16       q7,  d1,  d17
+        vaddl.s16       q0,  d2,  d18
+        vaddl.s16       q1,  d3,  d19
+
+        vext.8          q8,  q11, q12, #6
+        vld1.8          {q11, q12}, [r12]
+        vdup.8          d20, d26[6]
+        vdup.8          d21, d26[7]
+
+        vadd.i32        q2,  q2,  q6
+        vadd.i32        q3,  q3,  q7
+        vadd.i32        q4,  q4,  q0
+        vadd.i32        q5,  q5,  q1
+
+        vmull.s8        q0,  d16, d20
+        vmull.s8        q1,  d17, d20
+        vmull.s8        q6,  d22, d21
+        vmull.s8        q7,  d23, d21
+
+        vaddl.s16       q8,  d0,  d12
+        vaddl.s16       q9,  d1,  d13
+        vaddl.s16       q0,  d2,  d14
+        vaddl.s16       q1,  d3,  d15
+
+        vext.8          q6,  q11, q12, #1
+        vdup.8          d20, d27[0]
+        vext.8          q7,  q11, q12, #2
+        vdup.8          d21, d27[1]
+
+        vadd.i32        q2,  q2,  q8
+        vadd.i32        q3,  q3,  q9
+        vadd.i32        q4,  q4,  q0
+        vadd.i32        q5,  q5,  q1
+
+        vmull.s8        q0,  d12, d20
+        vmull.s8        q1,  d13, d20
+        vmull.s8        q8,  d14, d21
+        vmull.s8        q9,  d15, d21
+
+        vaddl.s16       q6,  d0,  d16
+        vaddl.s16       q7,  d1,  d17
+        vaddl.s16       q0,  d2,  d18
+        vaddl.s16       q1,  d3,  d19
+
+        vext.8          q8,  q11, q12, #3
+        vdup.8          d20, d27[2]
+        vext.8          q9,  q11, q12, #4
+        vdup.8          d21, d27[3]
+
+        vadd.i32        q2,  q2,  q6
+        vadd.i32        q3,  q3,  q7
+        vadd.i32        q4,  q4,  q0
+        vadd.i32        q5,  q5,  q1
+
+        vmull.s8        q0,  d16, d20
+        vmull.s8        q1,  d17, d20
+        vmull.s8        q6,  d18, d21
+        vmull.s8        q7,  d19, d21
+
+        sub             r12, r0,  #1*GRAIN_WIDTH + 3
+
+        vaddl.s16       q8,  d0,  d12
+        vaddl.s16       q9,  d1,  d13
+        vaddl.s16       q0,  d2,  d14
+        vaddl.s16       q1,  d3,  d15
+
+        vext.8          q6,  q11, q12, #5
+        vdup.8          d20, d27[4]
+        vext.8          q7,  q11, q12, #6
+        vdup.8          d21, d27[5]
+
+        vld1.8          {q11, q12}, [r12]
+
+        vadd.i32        q2,  q2,  q8
+        vadd.i32        q3,  q3,  q9
+        vadd.i32        q4,  q4,  q0
+        vadd.i32        q5,  q5,  q1
+
+        vmull.s8        q0,  d12, d20
+        vmull.s8        q1,  d13, d20
+        vmull.s8        q8,  d14, d21
+        vmull.s8        q9,  d15, d21
+
+        vaddl.s16       q6,  d0,  d16
+        vaddl.s16       q7,  d1,  d17
+        vaddl.s16       q0,  d2,  d18
+        vaddl.s16       q1,  d3,  d19
+
+        vdup.8          d20, d27[6]
+        vext.8          q9,  q11, q12, #1
+        vdup.8          d21, d27[7]
+
+        vadd.i32        q2,  q2,  q6
+        vadd.i32        q3,  q3,  q7
+        vadd.i32        q4,  q4,  q0
+        vadd.i32        q5,  q5,  q1
+
+        vmull.s8        q0,  d22, d20
+        vmull.s8        q1,  d23, d20
+        vmull.s8        q6,  d18, d21
+        vmull.s8        q7,  d19, d21
+
+        vaddl.s16       q8,  d0,  d12
+        vaddl.s16       q9,  d1,  d13
+        vaddl.s16       q0,  d2,  d14
+        vaddl.s16       q1,  d3,  d15
+
+        vext.8          q6,  q11, q12, #2
+        vdup.8          d20, d28[0]
+        vext.8          q7,  q11, q12, #3
+        vdup.8          d21, d28[1]
+
+        vadd.i32        q2,  q2,  q8
+        vadd.i32        q3,  q3,  q9
+        vadd.i32        q4,  q4,  q0
+        vadd.i32        q5,  q5,  q1
+
+        vmull.s8        q0,  d12, d20
+        vmull.s8        q1,  d13, d20
+        vmull.s8        q8,  d14, d21
+        vmull.s8        q9,  d15, d21
+
+        vaddl.s16       q6,  d0,  d16
+        vaddl.s16       q7,  d1,  d17
+        vaddl.s16       q0,  d2,  d18
+        vaddl.s16       q1,  d3,  d19
+
+        vext.8          q8,  q11, q12, #4
+        vdup.8          d20, d28[2]
+        vext.8          q9,  q11, q12, #5
+        vdup.8          d21, d28[3]
+
+        vadd.i32        q2,  q2,  q6
+        vadd.i32        q3,  q3,  q7
+        vadd.i32        q4,  q4,  q0
+        vadd.i32        q5,  q5,  q1
+
+        vmull.s8        q0,  d16, d20
+        vmull.s8        q1,  d17, d20
+        vmull.s8        q6,  d18, d21
+        vmull.s8        q7,  d19, d21
+
+        vaddl.s16       q8,  d0,  d12
+        vaddl.s16       q9,  d1,  d13
+        vaddl.s16       q0,  d2,  d14
+        vaddl.s16       q1,  d3,  d15
+
+        vext.8          q6,  q11, q12, #6
+        vdup.8          d20, d28[4]
+
+        vadd.i32        q2,  q2,  q8
+        vadd.i32        q3,  q3,  q9
+        vadd.i32        q4,  q4,  q0
+        vadd.i32        q5,  q5,  q1
+
+        vmull.s8        q0,  d12, d20
+        vmull.s8        q1,  d13, d20
+
+        vaddw.s16       q2,  q2,  d0
+        vaddw.s16       q3,  q3,  d1
+        vaddw.s16       q4,  q4,  d2
+        vaddw.s16       q5,  q5,  d3
+
+        bx              lr
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag3_\edge\()_neon
+        push            {r1, lr}
+        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[0]
+endfunc
+.endm
+
+sum_lag3_func y,      0,   left
+sum_lag3_func y,      0,   mid
+sum_lag3_func y,      0,   right, 15
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 15
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 9
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 9
+
+function generate_grain_rows_neon
+        push            {r11,lr}
+1:
+        get_grain_row   d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
+        subs            r1,  r1,  #1
+        store_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
+        bgt             1b
+        pop             {r11,pc}
+endfunc
+
+function generate_grain_rows_44_neon
+        push            {r11,lr}
+1:
+        get_grain_row_44 d16, d17, d18, d19, d20, d21
+        subs            r1,  r1,  #1
+        store_grain_row_44 d16, d17, d18, d19, d20, d21
+        bgt             1b
+        pop             {r11,pc}
+endfunc
+
+function gen_grain_uv_444_lag0_neon
+        vld1.8          {q3}, [r11]!
+        push            {r11,lr}
+        bl              get_gaussian_neon
+        vrshl.s16       q8,  q0,  q15
+        bl              get_gaussian_neon
+        vrshl.s16       q9,  q0,  q15
+        vqmovn.s16      d0,  q8
+        vqmovn.s16      d1,  q9
+
+        vand            q3,  q3,  q1
+        vmull.s8        q2,  d6,  d22
+        vmull.s8        q3,  d7,  d22
+        vrshl.s16       q2,  q2,  q12
+        vrshl.s16       q3,  q3,  q12
+        vaddw.s8        q2,  q2,  d0
+        vaddw.s8        q3,  q3,  d1
+        vqmovn.s16      d4,  q2
+        vqmovn.s16      d5,  q3
+        vst1.8          {q2}, [r0]!
+        pop             {r11,pc}
+endfunc
+
+function get_grain_row_44_neon
+        push            {r11,lr}
+        get_grain_row_44 d16, d17, d18, d19, d20, d21
+        pop             {r11,pc}
+endfunc
+
+function add_uv_420_coeff_lag0_neon
+        vld1.16         {q2, q3}, [r11]!
+        vld1.16         {q4, q5}, [r12]!
+        vpaddl.s8       q2,  q2
+        vpaddl.s8       q3,  q3
+        vpaddl.s8       q4,  q4
+        vpaddl.s8       q5,  q5
+        vadd.i16        q2,  q2,  q4
+        vadd.i16        q3,  q3,  q5
+        vrshrn.s16      d4,  q2,  #2
+        vrshrn.s16      d5,  q3,  #2
+        b               add_coeff_lag0_start
+endfunc
+
+function add_uv_422_coeff_lag0_neon
+        vld1.16         {q2, q3}, [r11]!
+        vpaddl.s8       q2,  q2
+        vpaddl.s8       q3,  q3
+        vrshrn.s16      d4,  q2,  #1
+        vrshrn.s16      d5,  q3,  #1
+
+add_coeff_lag0_start:
+        vand            q3,  q2,  q1
+        vmull.s8        q2,  d6,  d22
+        vmull.s8        q3,  d7,  d22
+        vrshl.s16       q2,  q2,  q12
+        vrshl.s16       q3,  q3,  q12
+        vaddw.s8        q2,  q2,  d0
+        vaddw.s8        q3,  q3,  d1
+        vqmovn.s16      d4,  q2
+        vqmovn.s16      d5,  q3
+        bx              lr
+endfunc
+
+.macro gen_grain_82 type
+function generate_grain_\type\()_8bpc_neon, export=1
+        push            {r4-r11,lr}
+
+.ifc \type, uv_444
+        mov             r12, r3
+        mov             lr,  #28
+        add             r11, r1,  #3*GRAIN_WIDTH
+        mov             r1,  r2
+        mul             r12, r12, lr
+.endif
+        movrel          r3,  X(gaussian_sequence)
+        ldr             r2,  [r1, #FGD_SEED]
+        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+        add             r4,  r1,  #FGD_AR_COEFFS_Y
+.else
+        add             r4,  r1,  #FGD_AR_COEFFS_UV
+.endif
+        adr             r5,  L(gen_grain_\type\()_tbl)
+        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
+        add             r9,  r9,  #4
+        ldr             r6,  [r5, r6, lsl #2]
+        vdup.16         q15, r9    // 4 + data->grain_scale_shift
+        add             r5,  r5,  r6
+        vneg.s16        q15, q15
+
+.ifc \type, uv_444
+        cmp             r12, #0
+        movw            r10, #0x49d8
+        movw            lr,  #0xb524
+        // Intentionally using a separate register instead of moveq with an
+        // immediate constant, to avoid armv8-deprecated IT instruction forms.
+        it              eq
+        moveq           r10, lr
+        add             r4,  r4,  r12       // Add offset to ar_coeffs_uv[1]
+        eor             r2,  r2,  r10
+.endif
+
+        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
+        mov             r8,  #1
+        mov             r10, #1
+        lsl             r8,  r8,  r7        // 1 << ar_coeff_shift
+        lsl             r10, r10, r9        // 1 << (4 + data->grain_scale_shift)
+        lsr             r8,  r8,  #1        // 1 << (ar_coeff_shift - 1)
+        lsr             r10, r10, #1        // 1 << (4 + data->grain_scale_shift - 1)
+
+        bx              r5
+
+        .align 2
+L(gen_grain_\type\()_tbl):
+        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, y
+        mov             r1,  #GRAIN_HEIGHT
+        bl              generate_grain_rows_neon
+.else
+
+        mov             r1,  #3
+        bl              generate_grain_rows_neon
+        mov             r1,  #GRAIN_HEIGHT-3
+
+        vdup.16         q12, r7
+        vld1.8          {d22[]}, [r4]       // ar_coeffs_uv[0]
+        vmov.i8         q0,  #0
+        vmov.i8         q1,  #255
+        vext.8          q13, q0,  q1,  #13
+        vext.8          q14, q1,  q0,  #1
+        vneg.s16        q12, q12
+
+1:
+        vmov            q1,  q13
+        bl              gen_grain_uv_444_lag0_neon // 16
+        vmov.i8         q1,  #255
+        bl              gen_grain_uv_444_lag0_neon // 32
+        bl              gen_grain_uv_444_lag0_neon // 48
+        bl              gen_grain_uv_444_lag0_neon // 64
+        vmov            q1,  q14
+        bl              gen_grain_uv_444_lag0_neon // 80
+        get_grain_2     d16
+        subs            r1,  r1,  #1
+        add             r11, r11, #2
+        vst1.16         {d16[0]}, [r0]!
+        bgt             1b
+.endif
+        pop             {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+        vpush           {q4-q7}
+        mov             r5,  #127
+        vld1.8          {d27[]}, [r4]!      // ar_coeffs_y[0]
+        vld1.8          {d28[]}, [r4]!      // ar_coeffs_y[1]
+        vld1.8          {d29[]}, [r4]       // ar_coeffs_y[2]
+.ifc \type, y
+        ldrsb           r4,  [r4, #1]       // ar_coeffs_y[3]
+.else
+        add             r4,  r4,  #2
+.endif
+
+        mov             r1,  #3
+.ifc \type, uv_444
+        vld1.8          {d13[]}, [r4]       // ar_coeffs_uv[4]
+        ldrsb           r4,  [r4, #-1]      // ar_coeffs_uv[3]
+.endif
+        bl              generate_grain_rows_neon
+
+        mov             r1,  #GRAIN_HEIGHT - 3
+1:
+        sum_\type\()_lag1 q7,  q8,  q8,  q9,  left
+        sum_\type\()_lag1 q8,  q8,  q9,  q10
+        sum_\type\()_lag1 q9,  q9,  q10, q11
+        sum_\type\()_lag1 q10, q10, q11, q12
+        sum_\type\()_lag1 q12, q11, q12, q13, right
+        get_grain_2     d26
+        subs            r1,  r1,  #1
+.ifc \type, uv_444
+        add             r11, r11, #2
+.endif
+        store_grain_row d14, d15, d16, d17, d18, d19, d20, d21, d24, d25, d26
+        vmov            q11, q10
+        vmov            q10, q9
+        vmov            q9,  q8
+        vmov            q8,  q7
+        bgt             1b
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+        vpush           {q4-q7}
+        mov             r5,  #127
+        vld1.8          {d28,d29}, [r4]     // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+        vmov.s8         r4,  d29[2]
+        vmov.s8         r10, d29[3]
+
+        mov             r1,  #3
+        bl              generate_grain_rows_neon
+
+        mov             r1,  #GRAIN_HEIGHT - 3
+1:
+        bl              sum_\type\()_lag2_left_neon
+        bl              sum_\type\()_lag2_mid_neon
+        bl              sum_\type\()_lag2_mid_neon
+        bl              sum_\type\()_lag2_mid_neon
+        bl              sum_\type\()_lag2_right_neon
+        get_grain_2     d16
+        subs            r1,  r1,  #1
+.ifc \type, uv_444
+        add             r11, r11, #2
+.endif
+        vst1.16         {d16[0]}, [r0]!
+        bgt             1b
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+        vpush           {q4-q7}
+        mov             r5,  #127
+        vld1.8          {q13, q14}, [r4]    // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+        vmov.u8         r4,  d28[5]
+        vmov.u8         r10, d28[6]
+        vmov.u8         r12, d28[7]
+
+        orr             r4,  r4,  r10, lsl #8
+        orr             r4,  r4,  r12, lsl #16
+
+        mov             r1,  #3
+        vpush           {d26}
+        bl              generate_grain_rows_neon
+        vpop            {d26}
+
+        mov             r1,  #GRAIN_HEIGHT - 3
+1:
+        bl              sum_\type\()_lag3_left_neon
+        bl              sum_\type\()_lag3_mid_neon
+        bl              sum_\type\()_lag3_mid_neon
+        bl              sum_\type\()_lag3_mid_neon
+        bl              sum_\type\()_lag3_right_neon
+        get_grain_2     d16
+        subs            r1,  r1,  #1
+.ifc \type, uv_444
+        add             r11, r11, #2
+.endif
+        vst1.16         {d16[0]}, [r0]!
+        bgt             1b
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+.macro set_height dst, type
+.ifc \type, uv_420
+        mov             \dst,  #SUB_GRAIN_HEIGHT-3
+.else
+        mov             \dst,  #GRAIN_HEIGHT-3
+.endif
+.endm
+
+.macro increment_y_ptr reg, type
+.ifc \type, uv_420
+        add             \reg, \reg, #2*GRAIN_WIDTH-(3*32)
+.else
+        sub             \reg, \reg, #3*32-GRAIN_WIDTH
+.endif
+.endm
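+
+// The row loops advance the luma read pointer by 3*32 bytes per generated
+// row; increment_y_ptr then adjusts it so that the net step is two luma rows
+// for the vertically subsampled 4:2:0 layout and a single luma row otherwise.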
+
+.macro gen_grain_44 type
+function generate_grain_\type\()_8bpc_neon, export=1
+        push            {r4-r11,lr}
+
+        mov             r12, r3
+        mov             lr,  #28
+        add             r11, r1,  #3*GRAIN_WIDTH-3
+        mov             r1,  r2
+        mul             r12, r12, lr
+
+        movrel          r3,  X(gaussian_sequence)
+        ldr             r2,  [r1, #FGD_SEED]
+        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
+        add             r4,  r1,  #FGD_AR_COEFFS_UV
+        adr             r5,  L(gen_grain_\type\()_tbl)
+        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
+        add             r9,  r9,  #4
+        ldr             r6,  [r5, r6, lsl #2]
+        vdup.16         q15, r9    // 4 + data->grain_scale_shift
+        add             r5,  r5,  r6
+        vneg.s16        q15, q15
+
+        cmp             r12, #0
+        movw            r10, #0x49d8
+        movw            lr,  #0xb524
+        // Intentionally using a separate register instead of moveq with an
+        // immediate constant, to avoid the IT instruction forms deprecated in ARMv8.
+        it              eq
+        moveq           r10, lr
+        add             r4,  r4,  r12       // Add offset to ar_coeffs_uv[1]
+        eor             r2,  r2,  r10
+
+        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
+        mov             r8,  #1
+        mov             r10, #1
+        lsl             r8,  r8,  r7        // 1 << ar_coeff_shift
+        lsl             r10, r10, r9        // 1 << (4 + data->grain_scale_shift)
+        lsr             r8,  r8,  #1        // 1 << (ar_coeff_shift - 1)
+        lsr             r10, r10, #1        // 1 << (4 + data->grain_scale_shift - 1)
+        bx              r5
+
+        .align 2
+L(gen_grain_\type\()_tbl):
+        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, uv_420
+        vpush           {q4-q5}
+.endif
+        mov             r1,  #3
+        bl              generate_grain_rows_44_neon
+        set_height      r1,  \type
+
+        vdup.16         q12, r7
+        vld1.8          {d22[]}, [r4]       // ar_coeffs_uv[0]
+        vmov.i8         q0,  #0
+        vmov.i8         q1,  #255
+        vext.8          q13, q0,  q1,  #13
+        vext.8          q14, q1,  q0,  #7
+        vneg.s16        q12, q12
+
+1:
+        bl              get_grain_row_44_neon
+.ifc \type, uv_420
+        add             r12, r11, #GRAIN_WIDTH
+.endif
+        vmov            q1,  q13
+        vmov            q0,  q8
+        bl              add_\type\()_coeff_lag0_neon
+        vmov.i8         q1,  #255
+        vmov            q0,  q9
+        vmov            q8,  q2
+        bl              add_\type\()_coeff_lag0_neon
+        vmov            q1,  q14
+        vmov            q0,  q10
+        vmov            q9,  q2
+        bl              add_\type\()_coeff_lag0_neon
+        vmov            q10, q2
+        subs            r1,  r1,  #1
+        increment_y_ptr r11, \type
+        store_grain_row_44 d16, d17, d18, d19, d20, d21
+        bgt             1b
+
+.ifc \type, uv_420
+        vpop            {q4-q5}
+.endif
+        pop             {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+        vpush           {q4-q7}
+        mov             r5,  #127
+        vld1.8          {d27[]}, [r4]!      // ar_coeffs_uv[0]
+        vld1.8          {d28[]}, [r4]!      // ar_coeffs_uv[1]
+        vld1.8          {d29[]}, [r4]       // ar_coeffs_uv[2]
+        add             r4,  r4,  #2
+
+        mov             r1,  #3
+        vld1.8          {d13[]}, [r4]       // ar_coeffs_uv[4]
+        ldrsb           r4,  [r4, #-1]      // ar_coeffs_uv[3]
+        bl              generate_grain_rows_44_neon
+
+        set_height      r1,  \type
+1:
+        sum_\type\()_lag1 q7,  q8,  q8,  q9,  left
+        sum_\type\()_lag1 q8,  q8,  q9,  q10
+        sum_\type\()_lag1 q10, q9,  q10, q11, right
+        subs            r1,  r1,  #1
+        increment_y_ptr r11, \type
+        store_grain_row_44 d14, d15, d16, d17, d20, d21
+        vmov            q9,  q8
+        vmov            q8,  q7
+        bgt             1b
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+        vpush           {q4-q7}
+        mov             r5,  #127
+        vld1.8          {d28,d29}, [r4]     // ar_coeffs_uv[0-12]
+
+        vmov.s8         r4,  d29[2]
+        vmov.s8         r10, d29[3]
+
+        mov             r1,  #3
+        bl              generate_grain_rows_44_neon
+
+        set_height      r1,  \type
+1:
+        bl              sum_\type\()_lag2_left_neon
+        bl              sum_\type\()_lag2_mid_neon
+        bl              sum_\type\()_lag2_right_neon
+        subs            r1,  r1,  #1
+        increment_y_ptr r11, \type
+        add             r0,  r0,  #GRAIN_WIDTH-48
+        bgt             1b
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+        vpush           {q4-q7}
+        mov             r5,  #127
+        vld1.8          {q13, q14}, [r4]    // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+        vmov.u8         r4,  d28[5]
+        vmov.u8         r10, d28[6]
+        vmov.u8         r12, d28[7]
+
+        orr             r4,  r4,  r10, lsl #8
+        orr             r4,  r4,  r12, lsl #16
+
+        mov             r1,  #3
+        bl              generate_grain_rows_44_neon
+
+        set_height      r1,  \type
+1:
+        bl              sum_\type\()_lag3_left_neon
+        bl              sum_\type\()_lag3_mid_neon
+        bl              sum_\type\()_lag3_right_neon
+        subs            r1,  r1,  #1
+        increment_y_ptr r11, \type
+        add             r0,  r0,  #GRAIN_WIDTH-48
+        bgt             1b
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
+.macro gather_interleaved dst1, dst2, src1, src2, off
+        vmov.u8         r11, \src1[0+\off]
+        vmov.u8         r12, \src2[0+\off]
+        add             r11, r11, r3
+        vmov.u8         lr,  \src1[2+\off]
+        add             r12, r12, r3
+        vld1.8          {\dst1[0+\off]}, [r11]
+        vmov.u8         r11, \src2[2+\off]
+        add             lr,  lr,  r3
+        vld1.8          {\dst2[0+\off]}, [r12]
+        vmov.u8         r12, \src1[4+\off]
+        add             r11, r11, r3
+        vld1.8          {\dst1[2+\off]}, [lr]
+        vmov.u8         lr,  \src2[4+\off]
+        add             r12, r12, r3
+        vld1.8          {\dst2[2+\off]}, [r11]
+        vmov.u8         r11, \src1[6+\off]
+        add             lr,  lr,  r3
+        vld1.8          {\dst1[4+\off]}, [r12]
+        vmov.u8         r12, \src2[6+\off]
+        add             r11, r11, r3
+        vld1.8          {\dst2[4+\off]}, [lr]
+        add             r12, r12, r3
+        vld1.8          {\dst1[6+\off]}, [r11]
+        vld1.8          {\dst2[6+\off]}, [r12]
+.endm
+
+.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4
+        gather_interleaved \dst1, \dst3, \src1, \src3, 0
+        gather_interleaved \dst1, \dst3, \src1, \src3, 1
+        gather_interleaved \dst2, \dst4, \src2, \src4, 0
+        gather_interleaved \dst2, \dst4, \src2, \src4, 1
+.endm
+
+function gather32_neon
+        push            {r11-r12,lr}
+        gather          d8,  d9,  d10, d11, d0,  d1,  d2,  d3
+        pop             {r11-r12,pc}
+endfunc
+
+function gather16_neon
+        push            {r11-r12,lr}
+        gather_interleaved d8,  d9,  d0,  d1,  0
+        gather_interleaved d8,  d9,  d0,  d1,  1
+        pop             {r11-r12,pc}
+endfunc
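+
+// gather16/gather32 emulate the byte-wise table lookup that 32-bit NEON
+// lacks; in rough C terms (sketch, names illustrative):
+//
+//   for (int i = 0; i < n; i++)
+//       scaled[i] = scaling[px[i]];    // scaling[] base address is in r3
+//
+// Each index is bounced through a core register and the looked-up byte is
+// loaded directly into a single destination lane, with two vectors
+// interleaved to hide the transfer latency.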
+
+const overlap_coeffs_0, align=4
+        .byte 27, 17, 0,  0,  0,  0,  0,  0
+        .byte 17, 27, 32, 32, 32, 32, 32, 32
+endconst
+
+const overlap_coeffs_1, align=4
+        .byte 23, 0,  0,  0,  0,  0,  0,  0
+        .byte 22, 32, 32, 32, 32, 32, 32, 32
+endconst
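+
+// Overlap blend weights: old and new grain are combined in the loops below as
+// round2(old*w_old + new*w_new, 5) (vmull/vmlal followed by vqrshrn #5).
+// overlap_coeffs_0 covers the two overlapping rows/columns of an unsubsampled
+// axis, overlap_coeffs_1 the single overlapping row/column of a subsampled
+// one; the 0/32 entries leave the remaining lanes unchanged (32*x >> 5 == x).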
+
+.macro calc_offset offx, offy, src, sx, sy
+        and             \offy, \src,  #0xF     // randval & 0xF
+        lsr             \offx, \src,  #4       // randval >> 4
+.if \sy == 0
+        add             \offy, \offy, \offy    // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+        add             \offx, \offx, \offx    // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride
+        mla             \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+        add             \dst, \dst, \offx          // grain_lut += offx
+.endm
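+
+// In C terms the two macros above derive the grain-LUT origin of a block from
+// its 16-bit random value (sketch):
+//
+//   offx = randval >> 4;      // doubled along an unsubsampled axis
+//   offy = randval & 0xF;     // likewise
+//   grain_lut += offy * grain_stride + offx;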
+
+// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
+//                                const ptrdiff_t stride,
+//                                const uint8_t scaling[SCALING_SIZE],
+//                                const int scaling_shift,
+//                                const entry grain_lut[][GRAIN_WIDTH],
+//                                const int offsets[][2],
+//                                const int h, const ptrdiff_t clip,
+//                                const ptrdiff_t type);
+function fgy_32x32_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]   // scaling_shift, grain_lut
+        ldrd            r6,  r7,  [sp, #108]   // offsets, h
+        ldr             r8,       [sp, #116]   // clip
+        mov             r9,  #GRAIN_WIDTH      // grain_lut stride
+
+        neg             r4,  r4
+        vdup.16         q13, r4                // -scaling_shift
+        cmp             r8,  #0
+
+        movrel_local    r12, overlap_coeffs_0
+
+        beq             1f
+        // clip
+        vmov.i8         q14, #16
+        vmov.i8         q15, #235
+        b               2f
+1:
+        // no clip
+        vmov.i8         q14, #0
+        vmov.i8         q15, #255
+2:
+
+        vld1.8          {d24, d25}, [r12, :128] // overlap_coeffs
+
+        add             r5,  r5,  #9           // grain_lut += 9
+        add             r5,  r5,  r9,  lsl #3  // grain_lut += 8 * grain_stride
+        add             r5,  r5,  r9           // grain_lut += grain_stride
+
+        ldr             r10, [r6, #8]          // offsets[1][0]
+        calc_offset     r10, r4,  r10, 0,   0
+        add_offset      r4,  r10, r4,  r5,  r9
+        ldr             r10, [r6, #4]          // offsets[0][1]
+        calc_offset     r10, r11, r10, 0,   0
+        add_offset      r11, r10, r11, r5,  r9
+        ldr             r10, [r6, #12]         // offsets[1][1]
+        calc_offset     r10, r8,  r10, 0,   0
+        add_offset      r8,  r10, r8,  r5,  r9
+        ldr             r6,  [r6]              // offsets[0][0]
+        calc_offset     r6,  lr,  r6,  0,   0
+        add_offset      r5,  r6,  lr,  r5,  r9
+
+        add             r4,  r4,  #32          // grain_lut += BLOCK_SIZE * bx
+        add             r6,  r11, r9,  lsl #5  // grain_lut += grain_stride * BLOCK_SIZE * by
+
+        ldr             r10, [sp, #120]        // type
+        adr             r11, L(fgy_loop_tbl)
+
+        tst             r10, #1
+        ldr             r10, [r11, r10, lsl #2]
+
+        add             r8,  r8,  r9,  lsl #5  // grain_lut += grain_stride * BLOCK_SIZE * by
+        add             r8,  r8,  #32          // grain_lut += BLOCK_SIZE * bx
+
+        add             r11, r11, r10
+
+        beq             1f
+        // y overlap
+        vdup.8          d14, d24[0]
+        vdup.8          d15, d24[1]
+        mov             r10, r7                // backup actual h
+        mov             r7,  #2
+1:
+        bx              r11
+endfunc
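+
+// fgy_32x32_8bpc computes four grain_lut pointers (the current block plus the
+// "old", "top" and "top old" neighbours, i.e. left, above and above-left),
+// selects the clip limits, and jumps into one of the four loop bodies below
+// according to the overlap type bits. With vertical overlap enabled, only the
+// first two rows are blended against the top block before branching to the
+// matching no-vertical-overlap loop for the remaining rows.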
+
+function fgy_loop_neon
+L(fgy_loop_tbl):
+        .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
+        .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
+        .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
+        .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB
+
+.macro fgy ox, oy
+L(loop_\ox\oy):
+1:
+.if \ox
+        vld1.8          {d8},       [r4],       r9 // grain_lut old
+.endif
+.if \oy
+        vld1.8          {q2, q3},   [r6],       r9 // grain_lut top
+.endif
+.if \ox && \oy
+        vld1.8          {d10},      [r8],       r9 // grain_lut top old
+.endif
+        vld1.8          {q0,  q1},  [r1, :128], r2 // src
+        vld1.8          {q10, q11}, [r5],       r9 // grain_lut
+
+.if \ox
+        vmull.s8        q4,  d8,  d24
+        vmlal.s8        q4,  d20, d25
+.endif
+
+.if \oy
+.if \ox
+        vmull.s8        q5,  d10, d24
+        vmlal.s8        q5,  d4,  d25
+        vqrshrn.s16     d20, q4,  #5
+        vqrshrn.s16     d4,  q5,  #5
+.endif
+
+        vmull.s8        q4,  d20, d15
+        vmull.s8        q5,  d21, d15
+        vmull.s8        q8,  d22, d15
+        vmull.s8        q9,  d23, d15
+        vmlal.s8        q4,  d4,  d14
+        vmlal.s8        q5,  d5,  d14
+        vmlal.s8        q8,  d6,  d14
+        vmlal.s8        q9,  d7,  d14
+        vqrshrn.s16     d20, q4,  #5
+        vqrshrn.s16     d21, q5,  #5
+        vqrshrn.s16     d22, q8,  #5
+        vqrshrn.s16     d23, q9,  #5
+.elseif \ox
+        vqrshrn.s16     d20, q4,  #5
+.endif
+
+        bl              gather32_neon
+
+        vmovl.s8        q8,  d20       // grain
+        vmovl.s8        q9,  d21
+        vmovl.s8        q10, d22
+        vmovl.s8        q11, d23
+
+        vmovl.u8        q2,  d8        // scaling
+        vmovl.u8        q3,  d9
+        vmovl.u8        q4,  d10
+        vmovl.u8        q5,  d11
+
+        vmul.i16        q8,  q8,  q2   // scaling * grain
+        vmul.i16        q9,  q9,  q3
+        vmul.i16        q10, q10, q4
+        vmul.i16        q11, q11, q5
+
+        vrshl.s16       q8,  q8,  q13  // round2(scaling * grain, scaling_shift)
+        vrshl.s16       q9,  q9,  q13
+        vrshl.s16       q10, q10, q13
+        vrshl.s16       q11, q11, q13
+
+        vaddw.u8        q8,  q8,  d0   // *src + noise
+        vaddw.u8        q9,  q9,  d1
+        vaddw.u8        q10, q10, d2
+        vaddw.u8        q11, q11, d3
+
+        vqmovun.s16     d0,  q8
+        vqmovun.s16     d1,  q9
+        vqmovun.s16     d2,  q10
+        vqmovun.s16     d3,  q11
+
+        vmax.u8         q0,  q0,  q14
+        vmax.u8         q1,  q1,  q14
+        vmin.u8         q0,  q0,  q15
+        vmin.u8         q1,  q1,  q15
+
+        subs            r7,  r7,  #1
+.if \oy
+        vdup.8          d14, d25[0]
+        vdup.8          d15, d25[1]
+.endif
+        vst1.8          {q0, q1}, [r0, :128], r2 // dst
+        bgt             1b
+
+.if \oy
+        cmp             r10, #2
+        sub             r7,  r10, #2           // restore actual remaining h
+        bgt             L(loop_\ox\()0)
+.endif
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+.endm
+
+        fgy             0, 0
+        fgy             0, 1
+        fgy             1, 0
+        fgy             1, 1
+endfunc
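+
+// One iteration of the loops above, in rough C (sketch; names illustrative):
+//
+//   int grain = blend_overlaps(grain_lut[y][x], ...);  // optional ox/oy paths
+//   int noise = round2(scaling[src[y][x]] * grain, scaling_shift);
+//   dst[y][x] = clamp(src[y][x] + noise, mn, mx);      // mn/mx from the clip setup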
+
+// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
+//                                     const pixel *const src,
+//                                     const ptrdiff_t stride,
+//                                     const uint8_t scaling[SCALING_SIZE],
+//                                     const Dav1dFilmGrainData *const data,
+//                                     const entry grain_lut[][GRAIN_WIDTH],
+//                                     const pixel *const luma_row,
+//                                     const ptrdiff_t luma_stride,
+//                                     const int offsets[][2],
+//                                     const ptrdiff_t h, const ptrdiff_t uv,
+//                                     const ptrdiff_t is_id,
+//                                     const ptrdiff_t type);
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]   // data, grain_lut
+        ldrd            r6,  r7,  [sp, #108]   // luma_row, luma_stride
+        ldrd            r8,  r9,  [sp, #116]   // offsets, h
+        ldrd            r10, r11, [sp, #124]   // uv, is_id
+
+        // !csfl
+        add             r10, r4,  r10, lsl #2  // + 4*uv
+        add             r12, r10, #FGD_UV_LUMA_MULT
+        add             lr,  r10, #FGD_UV_MULT
+        add             r10, r10, #FGD_UV_OFFSET
+        vld1.16         {d4[]},  [r12]         // uv_luma_mult
+        vld1.16         {d4[2]}, [r10]         // uv_offset
+        vld1.16         {d4[1]}, [lr]          // uv_mult
+
+        ldr             lr,  [r4, #FGD_SCALING_SHIFT]
+        ldr             r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+        neg             lr,  lr                // -scaling_shift
+
+        cmp             r12, #0
+        vdup.16         q13, lr                // -scaling_shift
+
+        beq             1f
+        // clip
+        cmp             r11, #0
+        vmov.i8         q14, #16
+        vmov.i8         q15, #240
+        beq             2f
+        // is_id
+        vmov.i8         q15, #235
+        b               2f
+1:
+        // no clip
+        vmov.i8         q14, #0
+        vmov.i8         q15, #255
+2:
+
+        mov             r10, #GRAIN_WIDTH      // grain_lut stride
+
+        add             r5,  r5,  #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
+.if \sy
+        add             r5,  r5,  r10, lsl #2  // grain_lut += 4 * grain_stride
+        add             r5,  r5,  r10, lsl #1  // grain_lut += 2 * grain_stride
+.else
+        add             r5,  r5,  r10, lsl #3  // grain_lut += 8 * grain_stride
+        add             r5,  r5,  r10          // grain_lut += grain_stride
+.endif
+
+        ldr             r12, [r8, #8]          // offsets[1][0]
+        calc_offset     r12, r4,  r12, \sx, \sy
+        add_offset      r4,  r12, r4,  r5,  r10
+
+        ldr             r12, [r8, #4]          // offsets[0][1]
+        calc_offset     r12, lr,  r12, \sx, \sy
+        add_offset      lr,  r12, lr,  r5,  r10
+
+        ldr             r12, [r8, #12]         // offsets[1][1]
+        calc_offset     r12, r11, r12, \sx, \sy
+        add_offset      r11, r12, r11, r5,  r10
+
+        ldr             r8,  [r8]              // offsets[0][0]
+        calc_offset     r8,  r12, r8,  \sx, \sy
+        add_offset      r5,  r8,  r12, r5,  r10
+
+        add             r4,  r4,  #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+        add             r8,  lr,  r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+        add             r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+        add             r11, r11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+
+        movrel_local    r12, overlap_coeffs_\sx
+        ldr             lr,  [sp, #132]        // type
+
+        vld1.8          {d24, d25}, [r12, :128] // overlap_coeffs
+
+        movrel_local    r12, L(fguv_loop_sx\sx\()_tbl)
+#if CONFIG_THUMB
+        // This uses movrel_local instead of adr above, because the target
+        // can be out of range for adr. But movrel_local leaves the thumb bit
+        // set on COFF (but probably wouldn't if building for thumb on ELF),
+        // thus try to clear the bit for robustness.
+        bic             r12, r12, #1
+#endif
+
+        tst             lr,  #1
+        ldr             lr,  [r12, lr,  lsl #2]
+
+        add             r12, r12, lr
+
+        beq             1f
+        // y overlap
+        sub             lr,  r9,  #(2 >> \sy)  // backup remaining h
+        mov             r9,  #(2 >> \sy)
+
+1:
+
+.if \sy
+        vmov.i8         d6,  #23
+        vmov.i8         d7,  #22
+.else
+        vmov.i8         d6,  #27
+        vmov.i8         d7,  #17
+.endif
+
+.if \sy
+        add             r7,  r7,  r7           // luma_stride *= 2
+.endif
+
+        bx              r12
+endfunc
+.endm
+
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+function fguv_loop_sx0_neon
+L(fguv_loop_sx0_tbl):
+        .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+.if \oy
+        mov             r12, lr
+.endif
+1:
+.if \ox
+        vld1.8          {d8},       [r4],        r10 // grain_lut old
+.endif
+.if \oy
+        vld1.8          {q8, q9},   [r8],        r10 // grain_lut top
+.endif
+.if \ox && \oy
+        vld1.8          {d10},      [r11],       r10 // grain_lut top old
+.endif
+        vld1.8          {q0,  q1},  [r6, :128],  r7  // luma
+        vld1.8          {q10, q11}, [r5],        r10 // grain_lut
+
+.if \ox
+        vmull.s8        q4,  d8,  d24
+        vmlal.s8        q4,  d20, d25
+.endif
+
+.if \oy
+.if \ox
+        vmull.s8        q5,  d10, d24
+        vmlal.s8        q5,  d16, d25
+        vqrshrn.s16     d20, q4,  #5
+        vqrshrn.s16     d16, q5,  #5
+.endif
+
+        vmull.s8        q4,  d20, d7
+        vmull.s8        q5,  d21, d7
+        vmull.s8        q6,  d22, d7
+        vmull.s8        q7,  d23, d7
+        vmlal.s8        q4,  d16, d6
+        vmlal.s8        q5,  d17, d6
+        vmlal.s8        q6,  d18, d6
+        vmlal.s8        q7,  d19, d6
+        vqrshrn.s16     d20, q4,  #5
+        vqrshrn.s16     d21, q5,  #5
+        vqrshrn.s16     d22, q6,  #5
+        vqrshrn.s16     d23, q7,  #5
+.elseif \ox
+        vqrshrn.s16     d20, q4,  #5
+.endif
+.if !\csfl
+        vld1.8          {q8,  q9},  [r1, :128] // src
+        vmovl.u8        q4,  d0
+        vmovl.u8        q5,  d1
+        vmovl.u8        q6,  d2
+        vmovl.u8        q7,  d3
+        vmovl.u8        q0,  d16
+        vmovl.u8        q1,  d17
+        vmovl.u8        q8,  d18
+        vmovl.u8        q9,  d19
+        vmul.i16        q4,  q4,  d4[0]
+        vmul.i16        q5,  q5,  d4[0]
+        vmul.i16        q6,  q6,  d4[0]
+        vmul.i16        q7,  q7,  d4[0]
+        vmul.i16        q0,  q0,  d4[1]
+        vmul.i16        q1,  q1,  d4[1]
+        vmul.i16        q8,  q8,  d4[1]
+        vmul.i16        q9,  q9,  d4[1]
+        vqadd.s16       q4,  q4,  q0
+        vqadd.s16       q5,  q5,  q1
+        vqadd.s16       q6,  q6,  q8
+        vqadd.s16       q7,  q7,  q9
+        vdup.16         q0,  d4[2]
+        vshr.s16        q4,  q4,  #6
+        vshr.s16        q5,  q5,  #6
+        vshr.s16        q6,  q6,  #6
+        vshr.s16        q7,  q7,  #6
+        vadd.i16        q4,  q4,  q0
+        vadd.i16        q5,  q5,  q0
+        vadd.i16        q6,  q6,  q0
+        vadd.i16        q7,  q7,  q0
+        vqmovun.s16     d0,  q4
+        vqmovun.s16     d1,  q5
+        vqmovun.s16     d2,  q6
+        vqmovun.s16     d3,  q7
+.endif
+
+        bl              gather32_neon
+
+        vld1.8          {q0,  q1},  [r1, :128], r2 // src
+
+        vmovl.s8        q8,  d20       // grain
+        vmovl.s8        q9,  d21
+        vmovl.s8        q10, d22
+        vmovl.s8        q11, d23
+
+        vmovl.u8        q6,  d8        // scaling
+        vmovl.u8        q7,  d9
+        vmovl.u8        q4,  d10
+        vmovl.u8        q5,  d11
+
+        vmul.i16        q8,  q8,  q6   // scaling * grain
+        vmul.i16        q9,  q9,  q7
+        vmul.i16        q10, q10, q4
+        vmul.i16        q11, q11, q5
+
+        vrshl.s16       q8,  q8,  q13  // round2(scaling * grain, scaling_shift)
+        vrshl.s16       q9,  q9,  q13
+        vrshl.s16       q10, q10, q13
+        vrshl.s16       q11, q11, q13
+
+        vaddw.u8        q8,  q8,  d0   // *src + noise
+        vaddw.u8        q9,  q9,  d1
+        vaddw.u8        q10, q10, d2
+        vaddw.u8        q11, q11, d3
+
+        vqmovun.s16     d0,  q8
+        vqmovun.s16     d1,  q9
+        vqmovun.s16     d2,  q10
+        vqmovun.s16     d3,  q11
+
+        vmax.u8         q0,  q0,  q14
+        vmax.u8         q1,  q1,  q14
+        vmin.u8         q0,  q0,  q15
+        vmin.u8         q1,  q1,  q15
+
+        subs            r9,  r9,  #1
+.if \oy
+        vdup.8          d6,  d25[0]
+        vdup.8          d7,  d25[1]
+.endif
+
+        vst1.8          {q0, q1}, [r0, :128], r2 // dst
+        bgt             1b
+
+.if \oy
+        cmp             r12, #0
+        mov             r9,  r12               // restore actual remaining h
+        bgt             L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
+.endif
+        b               9f
+.endm
+        fguv_loop_sx0   0, 0, 0
+        fguv_loop_sx0   0, 0, 1
+        fguv_loop_sx0   0, 1, 0
+        fguv_loop_sx0   0, 1, 1
+        fguv_loop_sx0   1, 0, 0
+        fguv_loop_sx0   1, 0, 1
+        fguv_loop_sx0   1, 1, 0
+        fguv_loop_sx0   1, 1, 1
+
+9:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
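+
+// When csfl (chroma scaling from luma) is disabled, the loops above first map
+// luma and chroma to the scaling index, roughly (sketch):
+//
+//   int t   = sat_add(uv_luma_mult * luma, uv_mult * uv) >> 6;
+//   int idx = clip_to_u8(t + uv_offset);
+//
+// whereas with csfl the (pairwise-averaged, for subsampled layouts) luma value
+// is used as the index directly.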
+
+function fguv_loop_sx1_neon
+L(fguv_loop_sx1_tbl):
+        .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx1 csfl, ox, oy
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+.if \oy
+        mov             r12, lr
+.endif
+1:
+.if \ox
+        vld1.8          {d8},       [r4],        r10 // grain_lut old
+.endif
+.if \oy
+        vld1.8          {q8},       [r8],        r10 // grain_lut top
+.endif
+.if \ox && \oy
+        vld1.8          {d10},      [r11],       r10 // grain_lut top old
+.endif
+        vld1.8          {q0,  q1},  [r6, :128],  r7  // luma
+        vld1.8          {q10},      [r5],        r10 // grain_lut
+        vld1.8          {q11},      [r1, :128],  r2  // src
+
+.if \ox
+        vmull.s8        q4,  d8,  d24
+        vmlal.s8        q4,  d20, d25
+.endif
+
+        vpaddl.u8       q0,  q0
+        vpaddl.u8       q1,  q1
+.if \oy
+.if \ox
+        vmull.s8        q5,  d10, d24
+        vmlal.s8        q5,  d16, d25
+        vqrshrn.s16     d20, q4,  #5
+        vqrshrn.s16     d16, q5,  #5
+.endif
+
+        vmull.s8        q4,  d20, d7
+        vmull.s8        q5,  d21, d7
+        vmlal.s8        q4,  d16, d6
+        vmlal.s8        q5,  d17, d6
+        vqrshrn.s16     d20, q4,  #5
+        vqrshrn.s16     d21, q5,  #5
+.elseif \ox
+        vqrshrn.s16     d20, q4,  #5
+.endif
+.if \csfl
+        vrshrn.u16      d0,  q0,  #1
+        vrshrn.u16      d1,  q1,  #1
+.else
+        vrshr.u16       q4,  q0,  #1
+        vrshr.u16       q5,  q1,  #1
+        vmovl.u8        q0,  d22
+        vmovl.u8        q1,  d23
+        vmul.i16        q4,  q4,  d4[0]
+        vmul.i16        q5,  q5,  d4[0]
+        vmul.i16        q0,  q0,  d4[1]
+        vmul.i16        q1,  q1,  d4[1]
+        vqadd.s16       q4,  q4,  q0
+        vqadd.s16       q5,  q5,  q1
+        vdup.16         q0,  d4[2]
+        vshr.s16        q4,  q4,  #6
+        vshr.s16        q5,  q5,  #6
+        vadd.i16        q4,  q4,  q0
+        vadd.i16        q5,  q5,  q0
+        vqmovun.s16     d0,  q4
+        vqmovun.s16     d1,  q5
+.endif
+
+        bl              gather16_neon
+
+        vmovl.s8        q8,  d20       // grain
+        vmovl.s8        q9,  d21
+
+        vmovl.u8        q6,  d8        // scaling
+        vmovl.u8        q7,  d9
+
+        vmul.i16        q8,  q8,  q6   // scaling * grain
+        vmul.i16        q9,  q9,  q7
+
+        vrshl.s16       q8,  q8,  q13  // round2(scaling * grain, scaling_shift)
+        vrshl.s16       q9,  q9,  q13
+
+        vaddw.u8        q8,  q8,  d22  // *src + noise
+        vaddw.u8        q9,  q9,  d23
+
+        vqmovun.s16     d0,  q8
+        vqmovun.s16     d1,  q9
+
+        vmax.u8         q0,  q0,  q14
+        vmin.u8         q0,  q0,  q15
+
+        subs            r9,  r9,  #1
+.if \oy
+        vswp            d6,  d7
+.endif
+        vst1.8          {q0}, [r0, :128], r2 // dst
+        bgt             1b
+
+.if \oy
+        cmp             r12, #0
+        mov             r9,  r12               // restore actual remaining h
+        bgt             L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
+.endif
+
+        b               9f
+.endm
+        fguv_loop_sx1   0, 0, 0
+        fguv_loop_sx1   0, 0, 1
+        fguv_loop_sx1   0, 1, 0
+        fguv_loop_sx1   0, 1, 1
+        fguv_loop_sx1   1, 0, 0
+        fguv_loop_sx1   1, 0, 1
+        fguv_loop_sx1   1, 1, 0
+        fguv_loop_sx1   1, 1, 1
+
+9:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/filmgrain16.S b/third_party/dav1d/src/arm/32/filmgrain16.S
new file mode 100644
index 0000000000000..6c36caceae5a5
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/filmgrain16.S
@@ -0,0 +1,2137 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+.macro increment_seed steps, shift=1
+        lsr             r11, r2,  #3
+        lsr             r12, r2,  #12
+        lsr             lr,  r2,  #1
+        eor             r11, r2,  r11                     // (r >> 0) ^ (r >> 3)
+        eor             r12, r12, lr                      // (r >> 12) ^ (r >> 1)
+        eor             r11, r11, r12                     // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+        lsr             r2,  r2,  #\steps
+.endif
+        and             r11, r11, #((1 << \steps) - 1)    // bit
+.if \shift
+        orr             r2,  r2,  r11, lsl #(16 - \steps) // *state
+.else
+        orr             r2,  r2,  r11, lsl #16            // *state
+.endif
+.endm
+
+.macro read_rand dest, bits, age
+        ubfx            \dest,  r2,   #16 - \bits - \age, #\bits
+.endm
+
+.macro read_shift_rand dest, bits
+        ubfx            \dest,  r2,   #17 - \bits, #\bits
+        lsr             r2,  r2,  #1
+.endm
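+
+// The macros above are the film-grain pseudo-random generator in rough C form
+// (a sketch of what the assembly does, not the reference implementation):
+//
+//   fb  = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & ((1 << steps) - 1);
+//   r   = (r >> steps) | (fb << (16 - steps));       // increment_seed, shift=1
+//   val = (r >> (16 - n - age)) & ((1 << n) - 1);    // read_rand, n bits
+//
+// With shift=0 the state is left unshifted and the feedback bits are or-ed in
+// at bit 16 and above; read_shift_rand reads n bits and drops one state bit.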
+
+// special calling convention:
+// r2 holds seed
+// r3 holds dav1d_gaussian_sequence
+// clobbers r11-r12
+// returns in d0-d1
+function get_gaussian_neon
+        push            {r5-r6,lr}
+        increment_seed  4
+        read_rand       r5,  11,  3
+        read_rand       r6,  11,  2
+        add             r5,  r3,  r5,  lsl #1
+        add             r6,  r3,  r6,  lsl #1
+        vld1.16         {d0[0]}, [r5]
+        read_rand       r5,  11,  1
+        vld1.16         {d0[1]}, [r6]
+        add             r5,  r3,  r5,  lsl #1
+        read_rand       r6,  11,  0
+        increment_seed  4
+        add             r6,  r3,  r6,  lsl #1
+        vld1.16         {d0[2]}, [r5]
+        read_rand       r5,  11,  3
+        vld1.16         {d0[3]}, [r6]
+        add             r5,  r3,  r5,  lsl #1
+        read_rand       r6,  11,  2
+        vld1.16         {d1[0]}, [r5]
+        add             r6,  r3,  r6,  lsl #1
+        read_rand       r5,  11,  1
+        vld1.16         {d1[1]}, [r6]
+        read_rand       r6,  11,  0
+        add             r5,  r3,  r5,  lsl #1
+        add             r6,  r3,  r6,  lsl #1
+        vld1.16         {d1[2]}, [r5]
+        vld1.16         {d1[3]}, [r6]
+        pop             {r5-r6,pc}
+endfunc
+
+function get_grain_2_neon
+        push            {r11,lr}
+        increment_seed  2
+        read_rand       r11, 11,  1
+        read_rand       r12, 11,  0
+        add             r11, r3,  r11, lsl #1
+        add             r12, r3,  r12, lsl #1
+        vld1.16         {d0[0]}, [r11]
+        vld1.16         {d0[1]}, [r12]
+        vrshl.s16       d0,  d0,  d30
+        pop             {r11,pc}
+endfunc
+
+.macro get_grain_2 dst
+        bl              get_grain_2_neon
+.ifnc \dst, d0
+        vmov            \dst, d0
+.endif
+.endm
+
+function get_grain_4_neon
+        push            {r11,lr}
+        increment_seed  4
+        read_rand       r11, 11,  3
+        read_rand       r12, 11,  2
+        add             r11, r3,  r11, lsl #1
+        add             r12, r3,  r12, lsl #1
+        vld1.16         {d0[0]}, [r11]
+        read_rand       r11, 11,  1
+        vld1.16         {d0[1]}, [r12]
+        read_rand       r12, 11,  0
+        add             r11, r3,  r11, lsl #1
+        add             r12, r3,  r12, lsl #1
+        vld1.16         {d0[2]}, [r11]
+        vld1.16         {d0[3]}, [r12]
+        vrshl.s16       d0,  d0,  d30
+        pop             {r11,pc}
+endfunc
+
+.macro get_grain_4 dst
+        bl              get_grain_4_neon
+.ifnc \dst, d0
+        vmov            \dst, d0
+.endif
+.endm
+
+// r1 holds the number of entries to produce
+// r6, r8 and r10 hold the previous output entries
+// q0 holds the vector of produced entries
+// q1 holds the input vector of sums from above
+.macro output_lag n
+function output_lag\n\()_neon
+        push            {r0, lr}
+.if \n == 1
+        mvn             lr,  r5                   // grain_min = ~grain_max
+.else
+        mov             r0,  #1
+        mov             lr,  #1
+        sub             r7,  r7,  #1
+        sub             r9,  r9,  #1
+        lsl             r0,  r0,  r7
+        lsl             lr,  lr,  r9
+        add             r7,  r7,  #1
+        add             r9,  r9,  #1
+.endif
+1:
+        read_shift_rand r12, 11
+        vmov.32         r11, d2[0]
+        lsl             r12, r12, #1
+        vext.8          q0,  q0,  q0,  #2
+        ldrsh           r12, [r3, r12]
+.if \n == 1
+        mla             r11, r6,  r4,  r11        // sum (above) + *coeff * prev output
+        add             r6,  r11, r8              // 1 << (ar_coeff_shift - 1)
+        add             r12, r12, r10             // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
+        asr             r6,  r6,  r7              // >> ar_coeff_shift
+        asr             r12, r12, r9              // >> (4 - bitdepth_min_8 + grain_scale_shift)
+        add             r6,  r6,  r12
+        cmp             r6,  r5
+.elseif \n == 2
+        mla             r11, r8,  r4,  r11        // sum (above) + *coeff * prev output 1
+        mla             r11, r6,  r10, r11        // += *coeff * prev output 2
+        mov             r8,  r6
+        add             r6,  r11, r0              // 1 << (ar_coeff_shift - 1)
+        add             r12, r12, lr              // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
+        asr             r6,  r6,  r7              // >> ar_coeff_shift
+        asr             r12, r12, r9              // >> (4 - bitdepth_min_8 + grain_scale_shift)
+        add             r6,  r6,  r12
+        push            {lr}
+        cmp             r6,  r5
+        mvn             lr,  r5                   // grain_min = ~grain_max
+.else
+        push            {r1-r3}
+        sbfx            r1,  r4,  #0,  #8
+        sbfx            r2,  r4,  #8,  #8
+        sbfx            r3,  r4,  #16, #8
+        mla             r11, r10, r1,  r11        // sum (above) + *coeff * prev output 1
+        mla             r11, r8,  r2,  r11        // sum (above) + *coeff * prev output 2
+        mla             r11, r6,  r3,  r11        // += *coeff * prev output 3
+        pop             {r1-r3}
+        mov             r10, r8
+        mov             r8,  r6
+
+        add             r6,  r11, r0              // 1 << (ar_coeff_shift - 1)
+        add             r12, r12, lr              // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
+        asr             r6,  r6,  r7              // >> ar_coeff_shift
+        asr             r12, r12, r9              // >> (4 - bitdepth_min_8 + grain_scale_shift)
+        add             r6,  r6,  r12
+        push            {lr}
+        cmp             r6,  r5
+        mvn             lr,  r5                   // grain_min = ~grain_max
+.endif
+        it              gt
+        movgt           r6,  r5
+        cmp             r6,  lr
+        it              lt
+        movlt           r6,  lr
+.if \n >= 2
+        pop             {lr}
+.endif
+        subs            r1,  r1,  #1
+        vext.8          q1,  q1,  q1,  #4
+        vmov.16         d1[3], r6
+        bgt             1b
+        pop             {r0, pc}
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
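+
+// Per-sample AR recursion carried out by output_lag1/2/3, in rough C terms
+// (sketch; names illustrative):
+//
+//   int g = sum_above;                               // next lane of q1
+//   for (int i = 0; i < lag; i++)
+//       g += ar_coeff[i] * prev_out[i];              // previous outputs in r6/r8/r10
+//   g  = (g + (1 << (ar_coeff_shift - 1))) >> ar_coeff_shift;
+//   g += (gauss + (1 << (gs - 1))) >> gs;            // gs = 4 - bitdepth_min_8 + grain_scale_shift
+//   out = clamp(g, grain_min, grain_max);            // grain_min == ~grain_max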
+
+
+function sum_lag1_above_neon
+        sub             r12, r0,  #1*GRAIN_WIDTH*2 - 16
+        vld1.16         {q10}, [r12] // load top right
+
+        vext.8          q0,  q8,  q9,  #14 // top left, top mid
+        vext.8          q1,  q9,  q10, #2  // top mid, top right
+
+        vmull.s16       q2,  d18, d28
+        vmlal.s16       q2,  d0,  d27
+        vmlal.s16       q2,  d2,  d29
+        vmull.s16       q3,  d19, d28
+        vmlal.s16       q3,  d1,  d27
+        vmlal.s16       q3,  d3,  d29
+
+        vmov            q8,  q9
+        vmov            q9,  q10
+
+        bx              lr
+endfunc
+
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff
+.ifc \lag\()_\edge, lag3_left
+        bl              sum_lag3_left_above_neon
+.else
+        bl              sum_\lag\()_above_neon
+.endif
+.ifc \type, uv_420
+        vpush           {q6-q7}
+        add             r12, r11, #GRAIN_WIDTH*2
+        vld1.16         {q0, q1}, [r11]!
+        vld1.16         {q6, q7}, [r12]!
+        vpadd.i16       d0,  d0,  d1
+        vpadd.i16       d1,  d2,  d3
+        vpadd.i16       d12, d12, d13
+        vpadd.i16       d13, d14, d15
+        vadd.i16        q0,  q0,  q6
+        vpop            {q6-q7}
+        vrshr.s16       q0,  q0,  #2
+.endif
+.ifc \type, uv_422
+        vld1.16         {q0, q1}, [r11]!
+        vpadd.i16       d0,  d0,  d1
+        vpadd.i16       d1,  d2,  d3
+        vrshr.s16       q0,  q0,  #1
+.endif
+.ifc \type, uv_444
+        vld1.16         {q0}, [r11]!
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+        vdup.8          d13, \uv_coeff
+        vmovl.s8        q6,  d13
+.endif
+        vmlal.s16       q2,  d0,  d13
+        vmlal.s16       q3,  d1,  d13
+.endif
+.if \uv_layout && \elems == 8
+        b               sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 7
+        b               sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 1
+        b               sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+        push            {r11}
+.if \elems > 4
+.ifc \edge, left
+        increment_seed  4
+        read_rand       r11, 11,  3
+        read_rand       r12, 11,  2
+        add             r11, r3,  r11, lsl #1
+        add             r12, r3,  r12, lsl #1
+        vld1.16         {d1[1]}, [r11]
+        read_rand       r11, 11,  1
+        vld1.16         {d1[2]}, [r12]
+        add             r11, r3,  r11, lsl #1
+        vld1.16         {d1[3]}, [r11]
+        lsl             r2,  r2,  #1             // shift back the state as if we'd done increment_seed with shift=0
+        vrshl.s16       d1,  d1,  d30
+        vext.8          q2,  q2,  q2,  #12
+.ifc \lag, lag3
+        vmov.s16        r10, d1[1]
+.endif
+.ifnc \lag, lag1
+        vmov.s16        r8,  d1[2]
+.endif
+        vmov.s16        r6,  d1[3]
+
+        vmov            q1,  q2
+        mov             r1,  #1
+        bl              output_\lag\()_neon
+.else
+        increment_seed  4, shift=0
+        vmov            q1,  q2
+        mov             r1,  #4
+        bl              output_\lag\()_neon
+.endif
+
+        increment_seed  4, shift=0
+        vmov            q1,  q3
+.ifc \edge, right
+        mov             r1,  #3
+        bl              output_\lag\()_neon
+        read_shift_rand r12, 11
+        add             r12, r3,  r12, lsl #1
+        vld1.16         {d2[0]}, [r12]
+        vrshl.s16       d2,  d2,  d30
+        vext.8          q0,  q0,  q1,  #2
+.else
+        mov             r1,  #4
+        bl              output_\lag\()_neon
+.endif
+.else
+        // elems == 1
+        increment_seed  4, shift=0
+        vmov            q1,  q2
+        mov             r1,  #1
+        bl              output_\lag\()_neon
+        lsr             r2,  r2,  #3
+
+        read_rand       r11, 11,  2
+        read_rand       r12, 11,  1
+        add             r11, r3,  r11, lsl #1
+        add             r12, r3,  r12, lsl #1
+        vld1.16         {d2[0]}, [r11]
+        read_rand       r11, 11,  0
+        vld1.16         {d2[1]}, [r12]
+        add             r11, r3,  r11, lsl #1
+        vld1.16         {d2[2]}, [r11]
+        vrshl.s16       d2,  d2,  d30
+        vext.8          q0,  q0,  q1,  #14
+.endif
+        vst1.16         {q0}, [r0]!
+        pop             {r11}
+        pop             {r1, pc}
+.endif
+.endm
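+
+// sum_lag_n_body stitches one 16-sample step together: the vectorised
+// above-row contribution arrives in q2/q3 from the sum_*_above helpers, an
+// optional luma-driven term is mixed in for the chroma planes (downsampled
+// according to uv_420/uv_422/uv_444), and the serial per-sample recursion is
+// then run through output_lag*_neon. In the left/right edge variants a few
+// lanes come straight from the scaled Gaussian sequence instead of the AR
+// filter.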
+
+.macro sum_lag1_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag1_\edge\()_neon
+        push            {r1, lr}
+.ifc \edge, left
+        sub             r12, r0,  #1*GRAIN_WIDTH*2
+        vld1.8          {q9},  [r12] // load the previous block right above
+.endif
+        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems
+endfunc
+.endm
+
+sum_lag1_func y,      0,   left
+sum_lag1_func y,      0,   mid
+sum_lag1_func y,      0,   right, 7
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 7
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 1
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 1
+
+
+function sum_lag2_above_neon
+        push            {lr}
+        sub             r12, r0,  #2*GRAIN_WIDTH*2 - 16
+        sub             lr,  r0,  #1*GRAIN_WIDTH*2 - 16
+        vld1.16         {q10}, [r12] // load top right
+        vld1.16         {q13}, [lr]
+
+        vdup.8          d10, d28[0]
+        vext.8          q0,  q8,  q9,  #12 // top left, top mid
+        vdup.8          d12, d28[1]
+        vext.8          q1,  q8,  q9,  #14
+        vdup.8          d14, d28[3]
+        vext.8          q4,  q9,  q10, #2  // top mid, top right
+        vmovl.s8        q5,  d10
+        vmovl.s8        q6,  d12
+        vmovl.s8        q7,  d14
+
+        vmull.s16       q2,  d0,  d10
+        vmlal.s16       q2,  d2,  d12
+        vmlal.s16       q2,  d8,  d14
+        vmull.s16       q3,  d1,  d10
+        vmlal.s16       q3,  d3,  d12
+        vmlal.s16       q3,  d9,  d14
+
+        vdup.8          d10, d28[4]
+        vext.8          q0,  q9,  q10, #4  // top mid, top right
+        vdup.8          d12, d28[5]
+        vext.8          q1,  q11, q12, #12 // top left, top mid
+        vdup.8          d14, d28[6]
+        vext.8          q4,  q11, q12, #14
+        vmovl.s8        q5,  d10
+        vmovl.s8        q6,  d12
+        vmovl.s8        q7,  d14
+
+        vmlal.s16       q2,  d0,  d10
+        vmlal.s16       q2,  d2,  d12
+        vmlal.s16       q2,  d8,  d14
+        vmlal.s16       q3,  d1,  d10
+        vmlal.s16       q3,  d3,  d12
+        vmlal.s16       q3,  d9,  d14
+
+        vdup.8          d10, d29[0]
+        vext.8          q0,  q12, q13, #2  // top mid, top right
+        vdup.8          d12, d29[1]
+        vext.8          q1,  q12, q13, #4
+
+        vdup.8          d14, d28[2]
+        vdup.8          d8,  d28[7]
+
+        vmovl.s8        q5,  d10
+        vmovl.s8        q6,  d12
+        vmovl.s8        q7,  d14
+        vmovl.s8        q4,  d8
+
+        vmlal.s16       q2,  d0,  d10
+        vmlal.s16       q2,  d2,  d12
+        vmlal.s16       q2,  d18, d14
+        vmlal.s16       q2,  d24, d8
+        vmlal.s16       q3,  d1,  d10
+        vmlal.s16       q3,  d3,  d12
+        vmlal.s16       q3,  d19, d14
+        vmlal.s16       q3,  d25, d8
+
+        vmov            q8,  q9
+        vmov            q9,  q10
+
+        vmov            q11, q12
+        vmov            q12, q13
+
+        pop             {pc}
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag2_\edge\()_neon
+        push            {r1, lr}
+.ifc \edge, left
+        sub             r12, r0,  #2*GRAIN_WIDTH*2
+        sub             lr,  r0,  #1*GRAIN_WIDTH*2
+        vld1.16         {q9},  [r12] // load the previous block right above
+        vld1.16         {q12}, [lr]
+.endif
+        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, uv_coeff=d29[4]
+endfunc
+.endm
+
+sum_lag2_func y,      0,   left
+sum_lag2_func y,      0,   mid
+sum_lag2_func y,      0,   right, 7
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 7
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 1
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 1
+
+
+function sum_lag3_left_above_neon
+        // A separate codepath for the left edge, to avoid reading outside
+        // of the edge of the buffer.
+        sub             r12, r0,  #3*GRAIN_WIDTH*2
+        vld1.8          {q11, q12}, [r12]
+        vext.8          q12, q11, q12, #10
+        vext.8          q11, q11, q11, #10
+        b               sum_lag3_above_start
+endfunc
+
+function sum_lag3_above_neon
+        movw            r12, #(3*GRAIN_WIDTH + 3)*2
+        sub             r12, r0,  r12
+        vld1.8          {q11, q12}, [r12]
+
+sum_lag3_above_start:
+        vdup.8          d12, d26[0]
+        vext.8          q1,  q11, q12, #2
+        vdup.8          d14, d26[1]
+        vext.8          q4,  q11, q12, #4
+        vdup.8          d16, d26[2]
+        vext.8          q5,  q11, q12, #6
+        vdup.8          d18, d26[3]
+        vmovl.s8        q6,  d12
+        vmovl.s8        q7,  d14
+        vmovl.s8        q8,  d16
+        vmovl.s8        q9,  d18
+
+        movw            r12, #(2*GRAIN_WIDTH + 3)*2
+        sub             r12, r0,  r12
+
+        vmull.s16       q2,  d22, d12
+        vmlal.s16       q2,  d2,  d14
+        vmlal.s16       q2,  d8,  d16
+        vmlal.s16       q2,  d10, d18
+        vmull.s16       q3,  d23, d12
+        vmlal.s16       q3,  d3,  d14
+        vmlal.s16       q3,  d9,  d16
+        vmlal.s16       q3,  d11, d18
+
+        vdup.8          d12, d26[4]
+        vext.8          q0,  q11, q12, #8
+        vdup.8          d14, d26[5]
+        vext.8          q1,  q11, q12, #10
+        vdup.8          d16, d26[6]
+        vext.8          q4,  q11, q12, #12
+        vld1.8          {q11, q12}, [r12]
+        vdup.8          d18, d26[7]
+        vmovl.s8        q6,  d12
+        vmovl.s8        q7,  d14
+        vmovl.s8        q8,  d16
+        vmovl.s8        q9,  d18
+
+        vmlal.s16       q2,  d0,  d12
+        vmlal.s16       q2,  d2,  d14
+        vmlal.s16       q2,  d8,  d16
+        vmlal.s16       q2,  d22, d18
+        vmlal.s16       q3,  d1,  d12
+        vmlal.s16       q3,  d3,  d14
+        vmlal.s16       q3,  d9,  d16
+        vmlal.s16       q3,  d23, d18
+
+        vdup.8          d12, d27[0]
+        vext.8          q0,  q11, q12, #2
+        vdup.8          d14, d27[1]
+        vext.8          q1,  q11, q12, #4
+        vdup.8          d16, d27[2]
+        vext.8          q4,  q11, q12, #6
+        vdup.8          d18, d27[3]
+        vext.8          q5,  q11, q12, #8
+        vmovl.s8        q6,  d12
+        vmovl.s8        q7,  d14
+        vmovl.s8        q8,  d16
+        vmovl.s8        q9,  d18
+
+        sub             r12, r0,  #(1*GRAIN_WIDTH + 3)*2
+
+        vmlal.s16       q2,  d0,  d12
+        vmlal.s16       q2,  d2,  d14
+        vmlal.s16       q2,  d8,  d16
+        vmlal.s16       q2,  d10, d18
+        vmlal.s16       q3,  d1,  d12
+        vmlal.s16       q3,  d3,  d14
+        vmlal.s16       q3,  d9,  d16
+        vmlal.s16       q3,  d11, d18
+
+        vdup.8          d12, d27[4]
+        vext.8          q0,  q11, q12, #10
+        vdup.8          d14, d27[5]
+        vext.8          q1,  q11, q12, #12
+        vld1.8          {q11, q12}, [r12]
+        vdup.8          d16, d27[6]
+        vdup.8          d18, d27[7]
+        vmovl.s8        q6,  d12
+        vmovl.s8        q7,  d14
+        vext.8          q5,  q11, q12, #2
+        vmovl.s8        q8,  d16
+        vmovl.s8        q9,  d18
+
+        vmlal.s16       q2,  d0,  d12
+        vmlal.s16       q2,  d2,  d14
+        vmlal.s16       q2,  d22, d16
+        vmlal.s16       q2,  d10, d18
+        vmlal.s16       q3,  d1,  d12
+        vmlal.s16       q3,  d3,  d14
+        vmlal.s16       q3,  d23, d16
+        vmlal.s16       q3,  d11, d18
+
+        vdup.8          d12, d28[0]
+        vext.8          q0,  q11, q12, #4
+        vdup.8          d14, d28[1]
+        vext.8          q1,  q11, q12, #6
+        vdup.8          d16, d28[2]
+        vext.8          q4,  q11, q12, #8
+        vdup.8          d18, d28[3]
+        vext.8          q5,  q11, q12, #10
+        vmovl.s8        q6,  d12
+        vmovl.s8        q7,  d14
+        vmovl.s8        q8,  d16
+        vmovl.s8        q9,  d18
+
+        vmlal.s16       q2,  d0,  d12
+        vmlal.s16       q2,  d2,  d14
+        vmlal.s16       q2,  d8,  d16
+        vmlal.s16       q2,  d10, d18
+        vmlal.s16       q3,  d1,  d12
+        vmlal.s16       q3,  d3,  d14
+        vmlal.s16       q3,  d9,  d16
+        vmlal.s16       q3,  d11, d18
+
+        vdup.8          d12, d28[4]
+        vext.8          q0,  q11, q12, #12
+        vmovl.s8        q6,  d12
+
+        vmlal.s16       q2,  d0,  d12
+        vmlal.s16       q3,  d1,  d12
+
+        bx              lr
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag3_\edge\()_neon
+        push            {r1, lr}
+        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, uv_coeff=d29[0]
+endfunc
+.endm
+
+sum_lag3_func y,      0,   left
+sum_lag3_func y,      0,   mid
+sum_lag3_func y,      0,   right, 7
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 7
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 1
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 1
+
+function generate_grain_rows_neon
+        push            {r10-r11,lr}
+1:
+        mov             r10, #80
+2:
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+        subs            r10, r10, #8
+        vst1.16         {q0}, [r0]!
+        bgt             2b
+        get_grain_2     d0
+        subs            r1,  r1,  #1
+        vst1.32         {d0[0]}, [r0]!
+        bgt             1b
+        pop             {r10-r11,pc}
+endfunc
+
+function generate_grain_rows_44_neon
+        push            {r10-r11,lr}
+1:
+        mov             r10, #40
+2:
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+        subs            r10, r10, #8
+        vst1.16         {q0}, [r0]!
+        bgt             2b
+        get_grain_4     d0
+        subs            r1,  r1,  #1
+        vst1.16         {d0}, [r0]
+        add             r0,  r0,  #GRAIN_WIDTH*2-80
+        bgt             1b
+        pop             {r10-r11,pc}
+endfunc
+
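+// C-level sketch of the lag0 chroma step implemented by the helpers below
+// (names follow the register comments: d22 = ar_coeffs_uv[0], q12 holds
+// -ar_coeff_shift, q9/q10 = grain_max/grain_min, q3 = the collocated and,
+// for 420/422, downsampled luma grain):
+//   grain = clip(gaussian + round2(luma * ar_coeffs_uv[0], ar_coeff_shift),
+//                grain_min, grain_max);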
+function gen_grain_uv_444_lag0_neon
+        vld1.16         {q3}, [r11]!
+gen_grain_uv_lag0_8_start:
+        push            {r11,lr}
+        bl              get_gaussian_neon
+        vrshl.s16       q0,  q0,  q15
+gen_grain_uv_lag0_8_add:
+        vand            q3,  q3,  q1
+        vmull.s16       q2,  d6,  d22
+        vmull.s16       q3,  d7,  d22
+        vrshl.s32       q2,  q2,  q12
+        vrshl.s32       q3,  q3,  q12
+        vqmovn.s32      d4,  q2
+        vqmovn.s32      d5,  q3
+        vqadd.s16       q2,  q2,  q0
+        vmin.s16        q2,  q2,  q9
+        vmax.s16        q2,  q2,  q10
+        vst1.16         {q2}, [r0]!
+        pop             {r11,pc}
+endfunc
+
+function gen_grain_uv_420_lag0_8_neon
+        add             r12, r11, #GRAIN_WIDTH*2
+        vld1.16         {q2,q3}, [r11]!
+        vld1.16         {q4,q5}, [r12]
+        vpadd.i16       d4,  d4,  d5
+        vpadd.i16       d5,  d6,  d7
+        vpadd.i16       d8,  d8,  d9
+        vpadd.i16       d9,  d10, d11
+        vadd.i16        q2,  q2,  q4
+        vrshr.s16       q3,  q2,  #2
+        b               gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_422_lag0_8_neon
+        vld1.16         {q2,q3}, [r11]!
+        vpadd.i16       d4,  d4,  d5
+        vpadd.i16       d5,  d6,  d7
+        vrshr.s16       q3,  q2,  #1
+        b               gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_420_lag0_4_neon
+        add             r12, r11, #GRAIN_WIDTH*2
+        vld1.16         {q2}, [r11]
+        vld1.16         {q0}, [r12]
+        add             r11, r11, #32
+        vpadd.i16       d4,  d4,  d5
+        vpadd.i16       d0,  d0,  d1
+        vadd.i16        d4,  d4,  d0
+        vrshr.s16       d6,  d4,  #2
+        push            {r11,lr}
+        get_grain_4     d0
+        b               gen_grain_uv_lag0_8_add
+endfunc
+
+function gen_grain_uv_422_lag0_4_neon
+        vld1.16         {q2}, [r11]
+        add             r11, r11, #32
+        vpadd.i16       d4,  d4,  d5
+        vrshr.s16       d6,  d4,  #1
+        push            {r11,lr}
+        get_grain_4     d0
+        b               gen_grain_uv_lag0_8_add
+endfunc
+
+.macro gen_grain_82 type
+function generate_grain_\type\()_16bpc_neon, export=1
+        push            {r4-r11,lr}
+
+.ifc \type, uv_444
+        ldr             r4,  [sp, #36]
+        mov             r12, r3
+        mov             lr,  #28
+        add             r11, r1,  #3*GRAIN_WIDTH*2
+        mov             r1,  r2
+        mul             r12, r12, lr
+        clz             lr,  r4
+.else
+        clz             lr,  r2
+.endif
+        movrel          r3,  X(gaussian_sequence)
+        sub             lr,  lr,  #24 // -bitdepth_min_8
+        ldr             r2,  [r1, #FGD_SEED]
+        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+        add             r4,  r1,  #FGD_AR_COEFFS_Y
+.else
+        add             r4,  r1,  #FGD_AR_COEFFS_UV
+.endif
+        add             r9,  r9,  lr // grain_scale_shift - bitdepth_min_8
+        adr             r5,  L(gen_grain_\type\()_tbl)
+        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
+        add             r9,  r9,  #4
+        ldr             r6,  [r5, r6, lsl #2]
+        vdup.16         q15, r9    // 4 - bitdepth_min_8 + data->grain_scale_shift
+        add             r5,  r5,  r6
+        vneg.s16        q15, q15
+
+.ifc \type, uv_444
+        push            {lr}
+        cmp             r12, #0
+        movw            r10, #0x49d8
+        movw            lr,  #0xb524
+        // Intentionally using a separate register instead of moveq with an
+        // immediate constant, to avoid ARMv8-deprecated IT instruction forms.
+        it              eq
+        moveq           r10, lr
+        add             r4,  r4,  r12       // Add offset to ar_coeffs_uv[1]
+        eor             r2,  r2,  r10
+        pop             {lr}
+.endif
+
+        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
+        neg             lr,  lr             // bitdepth_min_8
+        mov             r8,  #1
+        mov             r10, #1
+        lsl             r8,  r8,  r7        // 1 << ar_coeff_shift
+        lsl             r10, r10, r9        // 1 << (4 + data->grain_scale_shift)
+        lsr             r8,  r8,  #1        // 1 << (ar_coeff_shift - 1)
+        lsr             r10, r10, #1        // 1 << (4 + data->grain_scale_shift - 1)
+
+        bx              r5
+
+        .align 2
+L(gen_grain_\type\()_tbl):
+        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, y
+        mov             r1,  #GRAIN_HEIGHT
+        bl              generate_grain_rows_neon
+.else
+        mov             r5,  #128
+        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
+        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
+        mvn             r6,  r5             // grain_min = ~grain_max
+
+        mov             r1,  #3
+        bl              generate_grain_rows_neon
+        mov             r1,  #GRAIN_HEIGHT-3
+
+        vdup.32         q12, r7
+        vld1.8          {d22[]}, [r4]       // ar_coeffs_uv[0]
+        vmov.i8         q0,  #0
+        vmov.i8         q1,  #255
+        vdup.16         q9,  r5
+        vdup.16         q10, r6
+        vext.8          q13, q0,  q1,  #10
+        vext.8          q14, q1,  q0,  #2
+        vneg.s32        q12, q12
+        vmovl.s8        q11, d22
+
+1:
+        vmov            q1,  q13
+        bl              gen_grain_uv_444_lag0_neon // 8
+        vmov.i8         q1,  #255
+        bl              gen_grain_uv_444_lag0_neon // 16
+        bl              gen_grain_uv_444_lag0_neon // 24
+        bl              gen_grain_uv_444_lag0_neon // 32
+        bl              gen_grain_uv_444_lag0_neon // 40
+        bl              gen_grain_uv_444_lag0_neon // 48
+        bl              gen_grain_uv_444_lag0_neon // 56
+        bl              gen_grain_uv_444_lag0_neon // 64
+        bl              gen_grain_uv_444_lag0_neon // 72
+        vmov            q1,  q14
+        bl              gen_grain_uv_444_lag0_neon // 80
+        get_grain_2     d16
+        subs            r1,  r1,  #1
+        add             r11, r11, #4
+        vst1.32         {d16[0]}, [r0]!
+        bgt             1b
+.endif
+        pop             {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+        vpush           {q4-q7}
+        mov             r5,  #128
+        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
+        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
+        vld1.8          {d27[]}, [r4]!      // ar_coeffs_y[0]
+        vld1.8          {d28[]}, [r4]!      // ar_coeffs_y[1]
+        vld1.8          {d29[]}, [r4]       // ar_coeffs_y[2]
+.ifc \type, y
+        ldrsb           r4,  [r4, #1]       // ar_coeffs_y[3]
+.else
+        add             r4,  r4,  #2
+.endif
+
+        mov             r1,  #3
+.ifc \type, uv_444
+        vld1.8          {d13[]}, [r4]       // ar_coeffs_uv[4]
+        ldrsb           r4,  [r4, #-1]      // ar_coeffs_uv[3]
+.endif
+        bl              generate_grain_rows_neon
+        vmovl.s8        q13, d27
+        vmovl.s8        q12, d29
+        vmovl.s8        q14, d28
+        vmov            d29, d24
+.ifc \type, uv_444
+        vmovl.s8        q6,  d13
+.endif
+
+        mov             r1,  #GRAIN_HEIGHT - 3
+1:
+        bl              sum_\type\()_lag1_left_neon  // 8
+        bl              sum_\type\()_lag1_mid_neon   // 16
+        bl              sum_\type\()_lag1_mid_neon   // 24
+        bl              sum_\type\()_lag1_mid_neon   // 32
+        bl              sum_\type\()_lag1_mid_neon   // 40
+        bl              sum_\type\()_lag1_mid_neon   // 48
+        bl              sum_\type\()_lag1_mid_neon   // 56
+        bl              sum_\type\()_lag1_mid_neon   // 64
+        bl              sum_\type\()_lag1_mid_neon   // 72
+        bl              sum_\type\()_lag1_right_neon // 80
+        get_grain_2     d16
+        subs            r1,  r1,  #1
+.ifc \type, uv_444
+        add             r11, r11, #4
+.endif
+        vst1.32         {d16[0]}, [r0]!
+        bgt             1b
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+        vpush           {q4-q7}
+        mov             r5,  #128
+        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
+        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
+        vld1.8          {d28,d29}, [r4]     // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+        vmov.s8         r4,  d29[2]
+        vmov.s8         r10, d29[3]
+
+        mov             r1,  #3
+        bl              generate_grain_rows_neon
+
+        mov             r1,  #GRAIN_HEIGHT - 3
+1:
+        bl              sum_\type\()_lag2_left_neon  // 8
+        bl              sum_\type\()_lag2_mid_neon   // 16
+        bl              sum_\type\()_lag2_mid_neon   // 24
+        bl              sum_\type\()_lag2_mid_neon   // 32
+        bl              sum_\type\()_lag2_mid_neon   // 40
+        bl              sum_\type\()_lag2_mid_neon   // 48
+        bl              sum_\type\()_lag2_mid_neon   // 56
+        bl              sum_\type\()_lag2_mid_neon   // 64
+        bl              sum_\type\()_lag2_mid_neon   // 72
+        bl              sum_\type\()_lag2_right_neon // 80
+        get_grain_2     d16
+        subs            r1,  r1,  #1
+.ifc \type, uv_444
+        add             r11, r11, #4
+.endif
+        vst1.32         {d16[0]}, [r0]!
+        bgt             1b
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+        vpush           {q4-q7}
+        mov             r5,  #128
+        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
+        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
+        vld1.8          {q13, q14}, [r4]    // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+        vmov.u8         r4,  d28[5]
+        vmov.u8         r10, d28[6]
+        vmov.u8         r12, d28[7]
+
+        orr             r4,  r4,  r10, lsl #8
+        orr             r4,  r4,  r12, lsl #16
+
+        mov             r1,  #3
+        vpush           {d26}
+        bl              generate_grain_rows_neon
+        vpop            {d26}
+
+        mov             r1,  #GRAIN_HEIGHT - 3
+1:
+        bl              sum_\type\()_lag3_left_neon  // 8
+        bl              sum_\type\()_lag3_mid_neon   // 16
+        bl              sum_\type\()_lag3_mid_neon   // 24
+        bl              sum_\type\()_lag3_mid_neon   // 32
+        bl              sum_\type\()_lag3_mid_neon   // 40
+        bl              sum_\type\()_lag3_mid_neon   // 48
+        bl              sum_\type\()_lag3_mid_neon   // 56
+        bl              sum_\type\()_lag3_mid_neon   // 64
+        bl              sum_\type\()_lag3_mid_neon   // 72
+        bl              sum_\type\()_lag3_right_neon // 80
+        get_grain_2     d16
+        subs            r1,  r1,  #1
+.ifc \type, uv_444
+        add             r11, r11, #4
+.endif
+        vst1.32         {d16[0]}, [r0]!
+        bgt             1b
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+.macro set_height dst, type
+.ifc \type, uv_420
+        mov             \dst,  #SUB_GRAIN_HEIGHT-3
+.else
+        mov             \dst,  #GRAIN_HEIGHT-3
+.endif
+.endm
+
+.macro increment_y_ptr reg, type
+.ifc \type, uv_420
+        add             \reg, \reg, #2*GRAIN_WIDTH*2-(6*32)
+.else
+        sub             \reg, \reg, #6*32-GRAIN_WIDTH*2
+.endif
+.endm
+
+.macro gen_grain_44 type
+function generate_grain_\type\()_16bpc_neon, export=1
+        push            {r4-r11,lr}
+
+        ldr             r4,  [sp, #36]
+        mov             r12, r3
+        movw            r11, #(3*GRAIN_WIDTH-3)*2
+        mov             lr,  #28
+        add             r11, r1,  r11
+        mov             r1,  r2
+        mul             r12, r12, lr
+        clz             lr,  r4
+
+        movrel          r3,  X(gaussian_sequence)
+        sub             lr,  lr,  #24 // -bitdepth_min_8
+        ldr             r2,  [r1, #FGD_SEED]
+        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
+        add             r4,  r1,  #FGD_AR_COEFFS_UV
+        add             r9,  r9,  lr // grain_scale_shift - bitdepth_min_8
+        adr             r5,  L(gen_grain_\type\()_tbl)
+        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
+        add             r9,  r9,  #4
+        ldr             r6,  [r5, r6, lsl #2]
+        vdup.16         q15, r9    // 4 - bitdepth_min_8 + data->grain_scale_shift
+        add             r5,  r5,  r6
+        vneg.s16        q15, q15
+
+        push            {lr}
+        cmp             r12, #0
+        movw            r10, #0x49d8
+        movw            lr,  #0xb524
+        // Intentionally using a separate register instead of moveq with an
+        // immediate constant, to avoid ARMv8-deprecated IT instruction forms.
+        it              eq
+        moveq           r10, lr
+        add             r4,  r4,  r12       // Add offset to ar_coeffs_uv[1]
+        eor             r2,  r2,  r10
+        pop             {lr}
+
+        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
+        neg             lr,  lr
+        mov             r8,  #1
+        mov             r10, #1
+        lsl             r8,  r8,  r7        // 1 << ar_coeff_shift
+        lsl             r10, r10, r9        // 1 << (4 + data->grain_scale_shift)
+        lsr             r8,  r8,  #1        // 1 << (ar_coeff_shift - 1)
+        lsr             r10, r10, #1        // 1 << (4 + data->grain_scale_shift - 1)
+        bx              r5
+
+        .align 2
+L(gen_grain_\type\()_tbl):
+        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, uv_420
+        vpush           {q4-q5}
+.endif
+        mov             r5,  #128
+        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
+        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
+        mvn             r6,  r5             // grain_min = ~grain_max
+
+        mov             r1,  #3
+        bl              generate_grain_rows_44_neon
+        set_height      r1,  \type
+
+        vdup.32         q12, r7
+        vld1.8          {d22[]}, [r4]       // ar_coeffs_uv[0]
+        vmov.i8         q0,  #0
+        vmov.i8         q1,  #255
+        vdup.16         q9,  r5
+        vdup.16         q10, r6
+        vext.8          q13, q0,  q1,  #10
+        vext.8          q14, q1,  q0,  #14
+        vneg.s32        q12, q12
+        vmovl.s8        q11, d22
+
+1:
+        vmov            q1,  q13
+        bl              gen_grain_\type\()_lag0_8_neon // 8
+        vmov.i8         q1,  #255
+        bl              gen_grain_\type\()_lag0_8_neon // 16
+        bl              gen_grain_\type\()_lag0_8_neon // 24
+        bl              gen_grain_\type\()_lag0_8_neon // 32
+        bl              gen_grain_\type\()_lag0_8_neon // 40
+        vmov            q1,  q14
+        bl              gen_grain_\type\()_lag0_4_neon // 44
+        subs            r1,  r1,  #1
+        increment_y_ptr r11, \type
+        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
+        bgt             1b
+
+.ifc \type, uv_420
+        vpop            {q4-q5}
+.endif
+        pop             {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+        vpush           {q4-q7}
+        mov             r5,  #128
+        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
+        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
+        vld1.8          {d27[]}, [r4]!      // ar_coeffs_uv[0]
+        vld1.8          {d28[]}, [r4]!      // ar_coeffs_uv[1]
+        vld1.8          {d29[]}, [r4]       // ar_coeffs_uv[2]
+        add             r4,  r4,  #2
+
+        mov             r1,  #3
+        vld1.8          {d13[]}, [r4]       // ar_coeffs_uv[4]
+        ldrsb           r4,  [r4, #-1]      // ar_coeffs_uv[3]
+        bl              generate_grain_rows_44_neon
+        vmovl.s8        q13, d27
+        vmovl.s8        q12, d29
+        vmovl.s8        q14, d28
+        vmov            d29, d24
+        vmovl.s8        q6,  d13
+
+        set_height      r1,  \type
+1:
+        bl              sum_\type\()_lag1_left_neon  // 8
+        bl              sum_\type\()_lag1_mid_neon   // 16
+        bl              sum_\type\()_lag1_mid_neon   // 24
+        bl              sum_\type\()_lag1_mid_neon   // 32
+        bl              sum_\type\()_lag1_mid_neon   // 40
+        bl              sum_\type\()_lag1_right_neon // 44
+        subs            r1,  r1,  #1
+        increment_y_ptr r11, \type
+        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
+        bgt             1b
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+        vpush           {q4-q7}
+        mov             r5,  #128
+        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
+        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
+        vld1.8          {d28,d29}, [r4]     // ar_coeffs_uv[0-12]
+
+        vmov.s8         r4,  d29[2]
+        vmov.s8         r10, d29[3]
+
+        mov             r1,  #3
+        bl              generate_grain_rows_44_neon
+
+        set_height      r1,  \type
+1:
+        bl              sum_\type\()_lag2_left_neon  // 8
+        bl              sum_\type\()_lag2_mid_neon   // 16
+        bl              sum_\type\()_lag2_mid_neon   // 24
+        bl              sum_\type\()_lag2_mid_neon   // 32
+        bl              sum_\type\()_lag2_mid_neon   // 40
+        bl              sum_\type\()_lag2_right_neon // 44
+        subs            r1,  r1,  #1
+        increment_y_ptr r11, \type
+        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
+        bgt             1b
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+        vpush           {q4-q7}
+        mov             r5,  #128
+        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
+        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
+        vld1.8          {q13, q14}, [r4]    // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+        vmov.u8         r4,  d28[5]
+        vmov.u8         r10, d28[6]
+        vmov.u8         r12, d28[7]
+
+        orr             r4,  r4,  r10, lsl #8
+        orr             r4,  r4,  r12, lsl #16
+
+        mov             r1,  #3
+        bl              generate_grain_rows_44_neon
+
+        set_height      r1,  \type
+1:
+        bl              sum_\type\()_lag3_left_neon  // 8
+        bl              sum_\type\()_lag3_mid_neon   // 16
+        bl              sum_\type\()_lag3_mid_neon   // 24
+        bl              sum_\type\()_lag3_mid_neon   // 32
+        bl              sum_\type\()_lag3_mid_neon   // 40
+        bl              sum_\type\()_lag3_right_neon // 44
+        subs            r1,  r1,  #1
+        increment_y_ptr r11, \type
+        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
+        bgt             1b
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
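+// The gather helpers below do the scaling-LUT lookup: each 16-bit pixel
+// lane is moved to a general-purpose register, added to the scaling table
+// base kept in r3, and the byte at that address is loaded back into the
+// corresponding lane of the destination registers (32 pixels per call for
+// gather32_neon, 16 for gather16_neon).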
+.macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off
+        vmov.u16        r11, \src1[0+\off]
+        vmov.u16        r12, \src3[0+\off]
+        add             r11, r11, r3
+        vmov.u16        lr,  \src1[2+\off]
+        add             r12, r12, r3
+        vld1.8          {\dst1[0+\off]}, [r11]
+        vmov.u16        r11, \src3[2+\off]
+        add             lr,  lr,  r3
+        vld1.8          {\dst2[0+\off]}, [r12]
+        vmov.u16        r12, \src2[0+\off]
+        add             r11, r11, r3
+        vld1.8          {\dst1[2+\off]}, [lr]
+        vmov.u16        lr,  \src4[0+\off]
+        add             r12, r12, r3
+        vld1.8          {\dst2[2+\off]}, [r11]
+        vmov.u16        r11, \src2[2+\off]
+        add             lr,  lr,  r3
+        vld1.8          {\dst1[4+\off]}, [r12]
+        vmov.u16        r12, \src4[2+\off]
+        add             r11, r11, r3
+        vld1.8          {\dst2[4+\off]}, [lr]
+        add             r12, r12, r3
+        vld1.8          {\dst1[6+\off]}, [r11]
+        vld1.8          {\dst2[6+\off]}, [r12]
+.endm
+
+.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8
+        gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0
+        gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1
+        gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0
+        gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1
+.endm
+
+function gather32_neon
+        push            {r11-r12,lr}
+        gather          d8,  d9,  d10, d11, d0,  d1,  d2,  d3,  d4,  d5,  d6,  d7
+        pop             {r11-r12,pc}
+endfunc
+
+function gather16_neon
+        push            {r11-r12,lr}
+        gather_interleaved d8,  d9,  d0,  d1,  d2,  d3,  0
+        gather_interleaved d8,  d9,  d0,  d1,  d2,  d3,  1
+        pop             {r11-r12,pc}
+endfunc
+
+const overlap_coeffs_0, align=4
+        .short 27, 17, 0,  0
+        .short 17, 27, 32, 32
+endconst
+
+const overlap_coeffs_1, align=4
+        .short 23, 0,  0,  0
+        .short 22, 32, 32, 32
+endconst
+
+.macro calc_offset offx, offy, src, sx, sy
+        and             \offy, \src,  #0xF     // randval & 0xF
+        lsr             \offx, \src,  #4       // randval >> 4
+.if \sy == 0
+        add             \offy, \offy, \offy    // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+        add             \offx, \offx, \offx    // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride
+        mla             \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+        add             \dst, \dst, \offx, lsl #1  // grain_lut += offx
+.endm
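+// In C terms the two macros above compute (sketch; names follow the inline
+// comments, grain_lut entries are 2 bytes wide):
+//   offx = randval >> 4;   // doubled when sx == 0 (no horizontal subsampling)
+//   offy = randval & 0xF;  // doubled when sy == 0 (no vertical subsampling)
+//   grain_lut += offy * grain_stride + offx;   // 16-bit entries, hence lsl #1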
+
+// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
+//                                 const ptrdiff_t stride,
+//                                 const uint8_t scaling[SCALING_SIZE],
+//                                 const int scaling_shift,
+//                                 const entry grain_lut[][GRAIN_WIDTH],
+//                                 const int offsets[][2],
+//                                 const int h, const ptrdiff_t clip,
+//                                 const ptrdiff_t type,
+//                                 const int bitdepth_max);
+function fgy_32x32_16bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]   // scaling_shift, grain_lut
+        ldrd            r6,  r7,  [sp, #108]   // offsets, h
+        ldr             r8,       [sp, #116]   // clip
+        mov             r9,  #GRAIN_WIDTH*2    // grain_lut stride
+        ldr             r10,      [sp, #124]   // bitdepth_max
+
+        eor             r4,  r4,  #15          // 15 - scaling_shift
+        vdup.16         q6,  r10               // bitdepth_max
+        clz             r10, r10
+        vdup.16         q13, r4                // 15 - scaling_shift
+        rsb             r10, r10, #24          // bitdepth_min_8
+        cmp             r8,  #0
+        vdup.16         q12, r10               // bitdepth_min_8
+
+        movrel_local    r12, overlap_coeffs_0
+
+        beq             1f
+        // clip
+        vmov.i16        q14, #16
+        vmov.i16        q15, #235
+        vshl.s16        q14, q14, q12
+        vshl.s16        q15, q15, q12
+        b               2f
+1:
+        // no clip
+        vmov.i16        q14, #0
+        vmov            q15, q6
+2:
+        vshr.u16        q6,  q6,  #1           // grain_max
+
+        vld1.16         {d24, d25}, [r12, :128] // overlap_coeffs
+
+        add             r5,  r5,  #18          // grain_lut += 9
+        add             r5,  r5,  r9,  lsl #3  // grain_lut += 8 * grain_stride
+        add             r5,  r5,  r9           // grain_lut += grain_stride
+
+        ldr             r10, [r6, #8]          // offsets[1][0]
+        calc_offset     r10, r4,  r10, 0,   0
+        add_offset      r4,  r10, r4,  r5,  r9
+        ldr             r10, [r6, #4]          // offsets[0][1]
+        calc_offset     r10, r11, r10, 0,   0
+        add_offset      r11, r10, r11, r5,  r9
+        ldr             r10, [r6, #12]         // offsets[1][1]
+        calc_offset     r10, r8,  r10, 0,   0
+        add_offset      r8,  r10, r8,  r5,  r9
+        ldr             r6,  [r6]              // offsets[0][0]
+        calc_offset     r6,  lr,  r6,  0,   0
+        add_offset      r5,  r6,  lr,  r5,  r9
+
+        add             r4,  r4,  #32*2        // grain_lut += BLOCK_SIZE * bx
+        add             r6,  r11, r9,  lsl #5  // grain_lut += grain_stride * BLOCK_SIZE * by
+
+        ldr             r10, [sp, #120]        // type
+        adr             r11, L(fgy_loop_tbl)
+
+        tst             r10, #1
+        ldr             r10, [r11, r10, lsl #2]
+
+        add             r8,  r8,  r9,  lsl #5  // grain_lut += grain_stride * BLOCK_SIZE * by
+        add             r8,  r8,  #32*2        // grain_lut += BLOCK_SIZE * bx
+
+        add             r11, r11, r10
+
+        beq             1f
+        // y overlap
+        vdup.16         d14, d24[0]
+        vdup.16         d15, d24[1]
+        mov             r10, r7                // backup actual h
+        mov             r7,  #2
+1:
+        sub             r2,  r2,  #32          // src_stride   -= 32
+        sub             r9,  r9,  #32          // grain_stride -= 32
+        bx              r11
+endfunc
+
+function fgy_loop_neon
+L(fgy_loop_tbl):
+        .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
+        .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
+        .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
+        .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB
+
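+// Per pixel, each loop below applies (C-level sketch following the inline
+// comments; min/max are the clip range chosen at setup, and the vqrdmulh
+// implements round2 with scaling pre-shifted by 15 - scaling_shift):
+//   noise = round2(scaling[src] * grain, scaling_shift);
+//   dst   = clip(src + noise, min, max);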
+.macro fgy ox, oy
+L(loop_\ox\oy):
+1:
+.if \ox
+        vld1.16         {d0},       [r4],       r9 // grain_lut old
+.endif
+.if \oy
+        vld1.16         {q2,  q3},  [r6]!          // grain_lut top
+.endif
+.if \ox && \oy
+        vld1.16         {d2},       [r8],       r9 // grain_lut top old
+.endif
+.if \oy
+        vld1.16         {q4,  q5},  [r6],       r9 // grain_lut top
+.endif
+.if !\ox && !\oy
+        vld1.16         {q0,  q1},  [r1, :128]!    // src
+.endif
+        vld1.16         {q8,  q9},  [r5]!          // grain_lut
+.if !\ox && !\oy
+        vld1.16         {q2,  q3},  [r1, :128], r2 // src
+.endif
+.if !\oy
+        vmvn.i16        q5,  #0xf000               // 0x0fff
+.endif
+        vld1.16         {q10, q11}, [r5],       r9 // grain_lut
+
+.if \ox
+        add             r4,  r4,  #32
+        vmull.s16       q0,  d0,  d24
+        vmlal.s16       q0,  d16, d25
+.endif
+
+.if \oy
+.if \ox
+        add             r8,  r8,  #32
+        vmull.s16       q1,  d2,  d24
+        vmlal.s16       q1,  d4,  d25
+        vqrshrn.s32     d16, q0,  #5
+        vmvn            d0,  d12                   // grain_min
+        vqrshrn.s32     d4,  q1,  #5
+        vmin.s16        d16, d16, d12
+        vmin.s16        d4,  d4,  d12
+        vmax.s16        d16, d16, d0
+        vmax.s16        d4,  d4,  d0
+.endif
+
+        vmull.s16       q0,  d4,  d14
+        vmull.s16       q1,  d5,  d14
+        vmull.s16       q2,  d6,  d14
+        vmull.s16       q3,  d7,  d14
+        vmlal.s16       q0,  d16, d15
+        vmlal.s16       q1,  d17, d15
+        vmlal.s16       q2,  d18, d15
+        vmlal.s16       q3,  d19, d15
+        vmull.s16       q8,  d20, d15
+        vmull.s16       q9,  d21, d15
+        vmull.s16       q10, d22, d15
+        vmull.s16       q11, d23, d15
+        vmlal.s16       q8,  d8,  d14
+        vmlal.s16       q9,  d9,  d14
+        vmlal.s16       q10, d10, d14
+        vmlal.s16       q11, d11, d14
+        vmvn            q4,  q6                   // grain_min
+        vqrshrn.s32     d0,  q0,  #5
+        vqrshrn.s32     d1,  q1,  #5
+        vqrshrn.s32     d2,  q2,  #5
+        vqrshrn.s32     d3,  q3,  #5
+        vqrshrn.s32     d4,  q8,  #5
+        vqrshrn.s32     d5,  q9,  #5
+        vqrshrn.s32     d6,  q10, #5
+        vqrshrn.s32     d7,  q11, #5
+        vmin.s16        q8,  q0,  q6
+        vmin.s16        q9,  q1,  q6
+        vld1.16         {q0,  q1},  [r1, :128]!    // src
+        vmin.s16        q10, q2,  q6
+        vmin.s16        q11, q3,  q6
+        vmax.s16        q8,  q8,  q4
+        vmax.s16        q9,  q9,  q4
+        vld1.16         {q2,  q3},  [r1, :128], r2 // src
+        vmvn.i16        q5,  #0xf000               // 0x0fff
+        vmax.s16        q10, q10, q4
+        vmax.s16        q11, q11, q4
+.elseif \ox
+        vmvn            d4,  d12                   // grain_min
+        vqrshrn.s32     d16, q0,  #5
+        vld1.16         {q0,  q1},  [r1, :128]!    // src
+        vmin.s16        d16, d16, d12
+        vmax.s16        d16, d16, d4
+        vld1.16         {q2,  q3},  [r1, :128], r2 // src
+.endif
+
+        // Make sure that uninitialized pixels out of range past the right
+        // edge are in range; their actual values shouldn't matter.
+        vand            q0,  q0,  q5
+        vand            q1,  q1,  q5
+        vand            q2,  q2,  q5
+        vand            q3,  q3,  q5
+
+        bl              gather32_neon
+
+.if \ox || \oy
+        vpush           {q6-q7}
+.endif
+
+        vmovl.u8        q6,  d8        // scaling
+        vmovl.u8        q7,  d9
+        vmovl.u8        q4,  d10
+        vmovl.u8        q5,  d11
+
+        vshl.u16        q6,  q6,  q13  // scaling << (15 - scaling_shift)
+        vshl.u16        q7,  q7,  q13
+        vshl.u16        q4,  q4,  q13
+        vshl.u16        q5,  q5,  q13
+
+        vqrdmulh.s16    q8,  q8,  q6   // round2((scaling << (15 - scaling_shift)) * grain, 15)
+        vqrdmulh.s16    q9,  q9,  q7
+        vqrdmulh.s16    q10, q10, q4
+        vqrdmulh.s16    q11, q11, q5
+
+.if \ox || \oy
+        vpop            {q6-q7}
+.endif
+
+        vqadd.s16       q0,  q0,  q8   // *src + noise
+        vqadd.s16       q1,  q1,  q9
+        vqadd.s16       q2,  q2,  q10
+        vqadd.s16       q3,  q3,  q11
+
+        vmax.s16        q0,  q0,  q14
+        vmax.s16        q1,  q1,  q14
+        vmax.s16        q2,  q2,  q14
+        vmax.s16        q3,  q3,  q14
+        vmin.s16        q0,  q0,  q15
+        vmin.s16        q1,  q1,  q15
+        vmin.s16        q2,  q2,  q15
+        vmin.s16        q3,  q3,  q15
+
+        vst1.16         {q0, q1}, [r0, :128]!    // dst
+        subs            r7,  r7,  #1
+.if \oy
+        vdup.16         d14, d25[0]
+        vdup.16         d15, d25[1]
+.endif
+        vst1.16         {q2, q3}, [r0, :128], r2 // dst
+        bgt             1b
+
+.if \oy
+        cmp             r10, #2
+        sub             r7,  r10, #2           // restore actual remaining h
+        bgt             L(loop_\ox\()0)
+.endif
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+.endm
+
+        fgy             0, 0
+        fgy             0, 1
+        fgy             1, 0
+        fgy             1, 1
+endfunc
+
+// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
+//                                      const pixel *const src,
+//                                      const ptrdiff_t stride,
+//                                      const uint8_t scaling[SCALING_SIZE],
+//                                      const Dav1dFilmGrainData *const data,
+//                                      const entry grain_lut[][GRAIN_WIDTH],
+//                                      const pixel *const luma_row,
+//                                      const ptrdiff_t luma_stride,
+//                                      const int offsets[][2],
+//                                      const ptrdiff_t h, const ptrdiff_t uv,
+//                                      const ptrdiff_t is_id,
+//                                      const ptrdiff_t type,
+//                                      const int bitdepth_max);
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_16bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]   // data, grain_lut
+        ldrd            r10, r11, [sp, #124]   // uv, is_id
+        ldr             r6,       [sp, #136]   // bitdepth_max
+
+        clz             r7,  r6
+        rsb             r7,  r7,  #24          // bitdepth_min_8
+
+        // !csfl
+        add             r10, r4,  r10, lsl #2  // + 4*uv
+        add             r12, r10, #FGD_UV_LUMA_MULT
+        add             lr,  r10, #FGD_UV_MULT
+        ldrh            r10, [r10, #FGD_UV_OFFSET] // uv_offset
+        vld1.16         {d30[]},  [r12]        // uv_luma_mult
+        lsl             r10, r10, r7           // uv_offset << bitdepth_min_8
+        vld1.16         {d30[1]}, [lr]         // uv_mult
+
+        ldr             lr,  [r4, #FGD_SCALING_SHIFT]
+        ldr             r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+        eor             lr,  lr,  #15          // 15 - scaling_shift
+
+        vmov.16         d30[2], r10            // uv_offset << bitdepth_min_8
+
+        cmp             r12, #0
+        vdup.16         q13, lr                // 15 - scaling_shift
+
+        beq             1f
+        // clip
+        cmp             r11, #0
+        mov             r8,  #16
+        mov             r9,  #240
+        lsl             r8,  r8,  r7
+        lsl             r9,  r9,  r7
+        beq             2f
+        // is_id
+        mov             r9,  #235
+        lsl             r9,  r9,  r7
+        b               2f
+1:
+        // no clip
+        mov             r8,  #0
+        mov             r9,  r6                // bitdepth_max
+2:
+        vmov.16         d30[3], r6             // bitdepth_max
+        vdup.16         d31, r8                // clip_min
+
+        mov             r10, #GRAIN_WIDTH*2    // grain_lut stride
+
+.if \sy
+        mov             r6,  #23
+        mov             r7,  #22
+.else
+        mov             r6,  #27
+        mov             r7,  #17
+.endif
+        vmov.16         d31[1], r9             // clip_max
+
+        ldrd            r8,  r9,  [sp, #116]   // offsets, h
+
+        add             r5,  r5,  #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
+.if \sy
+        add             r5,  r5,  r10, lsl #2  // grain_lut += 4 * grain_stride
+        add             r5,  r5,  r10, lsl #1  // grain_lut += 2 * grain_stride
+.else
+        add             r5,  r5,  r10, lsl #3  // grain_lut += 8 * grain_stride
+        add             r5,  r5,  r10          // grain_lut += grain_stride
+.endif
+        vmov.16         d31[2], r6             // overlap y [0]
+
+        ldr             r12, [r8, #8]          // offsets[1][0]
+        calc_offset     r12, r4,  r12, \sx, \sy
+        add_offset      r4,  r12, r4,  r5,  r10
+
+        ldr             r12, [r8, #4]          // offsets[0][1]
+        calc_offset     r12, lr,  r12, \sx, \sy
+        add_offset      lr,  r12, lr,  r5,  r10
+
+        ldr             r12, [r8, #12]         // offsets[1][1]
+        calc_offset     r12, r11, r12, \sx, \sy
+        add_offset      r11, r12, r11, r5,  r10
+
+        ldr             r8,  [r8]              // offsets[0][0]
+        calc_offset     r8,  r12, r8,  \sx, \sy
+        add_offset      r5,  r8,  r12, r5,  r10
+
+        vmov.16         d31[3], r7             // overlap y [1]
+
+        add             r4,  r4,  #2*(32 >> \sx)      // grain_lut += BLOCK_SIZE * bx
+        add             r8,  lr,  r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+        add             r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+        add             r11, r11, #2*(32 >> \sx)      // grain_lut += BLOCK_SIZE * bx
+
+        movrel_local    r12, overlap_coeffs_\sx
+        ldr             lr,       [sp, #132]   // type
+        ldrd            r6,  r7,  [sp, #108]   // luma_row, luma_stride
+
+        vld1.16         {d24, d25}, [r12, :128] // overlap_coeffs
+
+        movrel_local    r12, L(fguv_loop_sx\sx\()_tbl)
+#if CONFIG_THUMB
+        // This uses movrel_local instead of adr above, because the target
+        // can be out of range for adr. But movrel_local leaves the thumb bit
+        // set on COFF (but probably wouldn't if building for thumb on ELF),
+        // thus try to clear the bit for robustness.
+        bic             r12, r12, #1
+#endif
+
+        tst             lr,  #1
+        ldr             lr,  [r12, lr,  lsl #2]
+
+        add             r12, r12, lr
+
+        beq             1f
+        // y overlap
+        sub             lr,  r9,  #(2 >> \sy)  // backup remaining h
+        mov             r9,  #(2 >> \sy)
+
+1:
+.if \sy
+        add             r7,  r7,  r7           // luma_stride *= 2
+.endif
+        sub             r7,  r7,  #32          // luma_stride -= 32
+
+        bx              r12
+endfunc
+.endm
+
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+function fguv_loop_sx0_neon
+L(fguv_loop_sx0_tbl):
+        .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+        sub             r2,  r2,  #32          // src_stride   -= 32
+        sub             r10, r10, #32          // grain_stride -= 32
+.if \oy
+        mov             r12, lr
+.endif
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy\()_loopstart):
+1:
+.if \ox
+        vld1.16         {d0},       [r4],       r10 // grain_lut old
+.endif
+.if \oy
+        vld1.16         {q2,  q3},  [r8]!           // grain_lut top
+.endif
+.if \ox && \oy
+        vld1.16         {d2},       [r11],      r10 // grain_lut top old
+.endif
+.if !\ox && !\oy
+        vld1.16         {q0,  q1},  [r6, :128]!     // luma
+.endif
+        vld1.16         {q8,  q9},  [r5]!           // grain_lut
+.if \oy
+        vld1.16         {q4,  q5},  [r8],       r10 // grain_lut top
+.endif
+.if !\ox && !\oy
+        vld1.16         {q2,  q3},  [r6, :128], r7  // luma
+.endif
+.if \oy
+        vdup.16         d28, d31[2]                 // overlap y coeff
+        vdup.16         d29, d31[3]                 // overlap y coeff
+.endif
+        vld1.16         {q10, q11}, [r5],       r10 // grain_lut
+
+.if \ox
+        vdup.16         q7,  d30[3]                // bitdepth_max
+        add             r4,  r4,  #32
+        vmull.s16       q0,  d0,  d24
+        vshr.u16        q7,  q7,  #1               // grain_max
+        vmlal.s16       q0,  d16, d25
+        vmvn            q6,  q7                    // grain_min
+.endif
+
+.if \oy
+.if \ox
+        add             r11, r11, #32
+        vmull.s16       q1,  d2,  d24
+        vmlal.s16       q1,  d4,  d25
+        vqrshrn.s32     d16, q0,  #5
+        vqrshrn.s32     d4,  q1,  #5
+        vmin.s16        d4,  d4,  d14
+        vmin.s16        d16, d16, d14
+        vmax.s16        d4,  d4,  d12
+        vmax.s16        d16, d16, d12
+.endif
+
+        vmull.s16       q0,  d4,  d28
+        vmull.s16       q1,  d5,  d28
+        vmull.s16       q2,  d6,  d28
+        vmull.s16       q3,  d7,  d28
+.if !\ox
+        vdup.16         q7,  d30[3]                // bitdepth_max
+.endif
+        vmlal.s16       q0,  d16, d29
+        vmlal.s16       q1,  d17, d29
+        vmlal.s16       q2,  d18, d29
+        vmlal.s16       q3,  d19, d29
+.if !\ox
+        vshr.u16        q7,  q7,  #1               // grain_max
+.endif
+        vmull.s16       q8,  d20, d29
+        vmull.s16       q9,  d21, d29
+        vmull.s16       q10, d22, d29
+        vmull.s16       q11, d23, d29
+.if !\ox
+        vmvn            q6,  q7                    // grain_min
+.endif
+        vmlal.s16       q8,  d8,  d28
+        vmlal.s16       q9,  d9,  d28
+        vmlal.s16       q10, d10, d28
+        vmlal.s16       q11, d11, d28
+        vqrshrn.s32     d0,  q0,  #5
+        vqrshrn.s32     d1,  q1,  #5
+        vqrshrn.s32     d2,  q2,  #5
+        vqrshrn.s32     d3,  q3,  #5
+        vqrshrn.s32     d4,  q8,  #5
+        vqrshrn.s32     d5,  q9,  #5
+        vqrshrn.s32     d6,  q10, #5
+        vqrshrn.s32     d7,  q11, #5
+        vmin.s16        q8,  q0,  q7
+        vmin.s16        q9,  q1,  q7
+        vld1.16         {q0,  q1},  [r6, :128]!    // luma
+        vmin.s16        q10, q2,  q7
+        vmin.s16        q11, q3,  q7
+        vmax.s16        q8,  q8,  q6
+        vmax.s16        q9,  q9,  q6
+        vld1.16         {q2,  q3},  [r6, :128], r7 // luma
+        vmax.s16        q10, q10, q6
+        vmax.s16        q11, q11, q6
+.elseif \ox
+        vqrshrn.s32     d16, q0,  #5
+        vld1.16         {q0,  q1},  [r6, :128]!    // luma
+        vmin.s16        d16, d16, d14
+        vld1.16         {q2,  q3},  [r6, :128], r7 // luma
+        vmax.s16        d16, d16, d12
+.endif
+
+.if !\csfl
+        vdup.16         d28, d30[0]   // uv_luma_mult
+        vld1.16         {q4,  q5},  [r1, :128]! // src
+        vdup.16         d29, d30[1]   // uv_mult
+        vmull.s16       q6,  d0,  d28
+        vmull.s16       q7,  d1,  d28
+        vmull.s16       q0,  d2,  d28
+        vmull.s16       q1,  d3,  d28
+        vmlal.s16       q6,  d8,  d29
+        vmlal.s16       q7,  d9,  d29
+        vmlal.s16       q0,  d10, d29
+        vmlal.s16       q1,  d11, d29
+        vld1.16         {q4,  q5},  [r1, :128]  // src
+        sub             r1,  r1,  #32
+        vshrn.s32       d12, q6,  #6
+        vshrn.s32       d13, q7,  #6
+        vshrn.s32       d14, q0,  #6
+        vshrn.s32       d15, q1,  #6
+        vmull.s16       q0,  d4,  d28
+        vmull.s16       q1,  d5,  d28
+        vmull.s16       q2,  d6,  d28
+        vmull.s16       q3,  d7,  d28
+        vmlal.s16       q0,  d8,  d29
+        vmlal.s16       q1,  d9,  d29
+        vmlal.s16       q2,  d10, d29
+        vmlal.s16       q3,  d11, d29
+        vdup.16         q14, d30[2]   // uv_offset
+        vshrn.s32       d0,  q0,  #6
+        vshrn.s32       d1,  q1,  #6
+        vshrn.s32       d2,  q2,  #6
+        vshrn.s32       d3,  q3,  #6
+        vdup.16         q4,  d30[3]   // bitdepth_max
+        vmov.i16        q5,  #0
+        vadd.i16        q6,  q6,  q14
+        vadd.i16        q7,  q7,  q14
+        vadd.i16        q2,  q0,  q14
+        vadd.i16        q3,  q1,  q14
+        vmin.s16        q0,  q6,  q4
+        vmin.s16        q1,  q7,  q4
+        vmin.s16        q2,  q2,  q4
+        vmin.s16        q3,  q3,  q4
+        vmax.s16        q0,  q0,  q5
+        vmax.s16        q1,  q1,  q5
+        vmax.s16        q2,  q2,  q5
+        vmax.s16        q3,  q3,  q5
+.else
+        vdup.16         q14, d30[3]  // bitdepth_max
+        // Make sure that uninitialized pixels out of range past the right
+        // edge are in range; their actual values shouldn't matter.
+        vand            q0,  q0,  q14
+        vand            q1,  q1,  q14
+        vand            q2,  q2,  q14
+        vand            q3,  q3,  q14
+.endif
+
+        bl              gather32_neon
+
+        vld1.16         {q0,  q1},  [r1, :128]!    // src
+
+        vmovl.u8        q6,  d8        // scaling
+        vmovl.u8        q7,  d9
+        vmovl.u8        q4,  d10
+        vmovl.u8        q5,  d11
+
+        vld1.16         {q2,  q3},  [r1, :128], r2 // src
+
+        vshl.u16        q6,  q6,  q13  // scaling << (15 - scaling_shift)
+        vshl.u16        q7,  q7,  q13
+        vshl.u16        q4,  q4,  q13
+        vshl.u16        q5,  q5,  q13
+
+        vqrdmulh.s16    q8,  q8,  q6   // round2((scaling << (15 - scaling_shift)) * grain, 15)
+        vqrdmulh.s16    q9,  q9,  q7
+        vqrdmulh.s16    q10, q10, q4
+        vqrdmulh.s16    q11, q11, q5
+
+
+        vdup.16         q4,  d31[0]    // clip_min
+        vdup.16         q5,  d31[1]    // clip_max
+
+        vqadd.s16       q0,  q0,  q8   // *src + noise
+        vqadd.s16       q1,  q1,  q9
+        vqadd.s16       q2,  q2,  q10
+        vqadd.s16       q3,  q3,  q11
+
+.if \oy
+        vmov.32         lr,  d25[0] // first two 16 bit coeffs from overlap x
+.endif
+
+        vmax.s16        q0,  q0,  q4
+        vmax.s16        q1,  q1,  q4
+        vmax.s16        q2,  q2,  q4
+        vmax.s16        q3,  q3,  q4
+        vmin.s16        q0,  q0,  q5
+        vmin.s16        q1,  q1,  q5
+        vmin.s16        q2,  q2,  q5
+        vmin.s16        q3,  q3,  q5
+
+        vst1.16         {q0, q1}, [r0, :128]! // dst
+
+        subs            r9,  r9,  #1
+.if \oy
+        vmov.32         d31[1], lr  // new coeffs for overlap y
+.endif
+
+        vst1.16         {q2, q3}, [r0, :128], r2 // dst
+        bgt             1b
+
+.if \oy
+        cmp             r12, #0
+        mov             r9,  r12               // restore actual remaining h
+        bgt             L(fguv_loop_sx0_csfl\csfl\()_\ox\()0_loopstart)
+.endif
+        b               9f
+.endm
+        fguv_loop_sx0   0, 0, 0
+        fguv_loop_sx0   0, 0, 1
+        fguv_loop_sx0   0, 1, 0
+        fguv_loop_sx0   0, 1, 1
+        fguv_loop_sx0   1, 0, 0
+        fguv_loop_sx0   1, 0, 1
+        fguv_loop_sx0   1, 1, 0
+        fguv_loop_sx0   1, 1, 1
+
+9:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+
+function fguv_loop_sx1_neon
+L(fguv_loop_sx1_tbl):
+        .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+        .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx1 csfl, ox, oy
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+.if \oy
+        mov             r12, lr
+.endif
+1:
+.if \ox
+        vld1.16         {d0},       [r4],       r10 // grain_lut old
+.endif
+.if \ox && \oy
+        vld1.16         {d2},       [r11],      r10 // grain_lut top old
+.endif
+.if \oy
+        vld1.16         {q2,  q3},  [r8],       r10 // grain_lut top
+.endif
+.if !\ox && !\oy
+        vld1.16         {q0,  q1},  [r6, :128]!     // luma
+.endif
+        vld1.16         {q8,  q9},  [r5],       r10 // grain_lut
+.if \oy
+        vdup.16         d28, d31[2]                 // overlap y coeff
+        vdup.16         d29, d31[3]                 // overlap y coeff
+.endif
+.if !\ox && !\oy
+        vld1.16         {q2,  q3},  [r6, :128], r7  // luma
+.endif
+
+.if \ox
+        vdup.16         q7,  d30[3]                // bitdepth_max
+        vmull.s16       q0,  d0,  d24
+        vshr.u16        q7,  q7,  #1               // grain_max
+        vmlal.s16       q0,  d16, d25
+        vmvn            q6,  q7                    // grain_min
+.endif
+
+.if \oy
+.if \ox
+        vmull.s16       q1,  d2,  d24
+        vmlal.s16       q1,  d4,  d25
+        vqrshrn.s32     d16, q0,  #5
+        vqrshrn.s32     d4,  q1,  #5
+        vmin.s16        d4,  d4,  d14
+        vmin.s16        d16, d16, d14
+        vmax.s16        d4,  d4,  d12
+        vmax.s16        d16, d16, d12
+.endif
+
+        vmull.s16       q0,  d4,  d28
+        vmull.s16       q1,  d5,  d28
+        vmull.s16       q2,  d6,  d28
+        vmull.s16       q3,  d7,  d28
+.if !\ox
+        vdup.16         q7,  d30[3]                // bitdepth_max
+.endif
+        vmlal.s16       q0,  d16, d29
+        vmlal.s16       q1,  d17, d29
+        vmlal.s16       q2,  d18, d29
+        vmlal.s16       q3,  d19, d29
+.if !\ox
+        vshr.u16        q7,  q7,  #1               // grain_max
+.endif
+        vqrshrn.s32     d16, q0,  #5
+        vqrshrn.s32     d17, q1,  #5
+        vqrshrn.s32     d18, q2,  #5
+        vqrshrn.s32     d19, q3,  #5
+.if !\ox
+        vmvn            q6,  q7                    // grain_min
+.endif
+        vld1.16         {q0,  q1},  [r6, :128]!    // luma
+        vmin.s16        q8,  q8,  q7
+        vmin.s16        q9,  q9,  q7
+        vmax.s16        q8,  q8,  q6
+        vmax.s16        q9,  q9,  q6
+        vld1.16         {q2,  q3},  [r6, :128], r7 // luma
+.elseif \ox
+        vqrshrn.s32     d16, q0,  #5
+        vld1.16         {q0,  q1},  [r6, :128]!    // luma
+        vmin.s16        d16, d16, d14
+        vld1.16         {q2,  q3},  [r6, :128], r7 // luma
+        vmax.s16        d16, d16, d12
+.endif
+
+        vpadd.i16       d0,  d0,  d1
+        vpadd.i16       d1,  d2,  d3
+        vpadd.i16       d2,  d4,  d5
+        vpadd.i16       d3,  d6,  d7
+        vrshr.u16       q0,  q0,  #1
+        vrshr.u16       q1,  q1,  #1
+.if !\csfl
+        vdup.16         d28, d30[0]   // uv_luma_mult
+        vld1.16         {q2,  q3},  [r1, :128], r2 // src
+        vdup.16         d29, d30[1]   // uv_mult
+        vmull.s16       q6,  d0,  d28
+        vmull.s16       q7,  d1,  d28
+        vmull.s16       q0,  d2,  d28
+        vmull.s16       q1,  d3,  d28
+        vmlal.s16       q6,  d4,  d29
+        vmlal.s16       q7,  d5,  d29
+        vmlal.s16       q0,  d6,  d29
+        vmlal.s16       q1,  d7,  d29
+        vshrn.s32       d12, q6,  #6
+        vshrn.s32       d13, q7,  #6
+        vshrn.s32       d14, q0,  #6
+        vshrn.s32       d15, q1,  #6
+        vdup.16         q14, d30[2]   // uv_offset
+        vdup.16         q4,  d30[3]   // bitdepth_max
+        vmov.i16        q5,  #0
+        vadd.i16        q6,  q6,  q14
+        vadd.i16        q7,  q7,  q14
+        vmin.s16        q0,  q6,  q4
+        vmin.s16        q1,  q7,  q4
+        vmax.s16        q0,  q0,  q5
+        vmax.s16        q1,  q1,  q5
+.else
+        vdup.16         q14, d30[3]  // bitdepth_max
+        vld1.16         {q2,  q3},  [r1, :128], r2 // src
+
+        // Make sure that uninitialized pixels out of range past the right
+        // edge are in range; their actual values shouldn't matter.
+        vand            q0,  q0,  q14
+        vand            q1,  q1,  q14
+.endif
+
+        bl              gather16_neon
+
+        vmovl.u8        q6,  d8        // scaling
+        vmovl.u8        q7,  d9
+
+        vshl.u16        q6,  q6,  q13  // scaling << (15 - scaling_shift)
+        vshl.u16        q7,  q7,  q13
+
+        vqrdmulh.s16    q8,  q8,  q6   // round2((scaling << (15 - scaling_shift)) * grain, 15)
+        vqrdmulh.s16    q9,  q9,  q7
+
+
+        vdup.16         q4,  d31[0]    // clip_min
+        vdup.16         q5,  d31[1]    // clip_max
+
+        vqadd.s16       q0,  q2,  q8   // *src + noise
+        vqadd.s16       q1,  q3,  q9
+
+.if \oy
+        // Swap the last two coefficients of d31, place them first in d28
+        vrev64.16       d28, d31
+.endif
+
+        vmax.s16        q0,  q0,  q4
+        vmax.s16        q1,  q1,  q4
+        vmin.s16        q0,  q0,  q5
+        vmin.s16        q1,  q1,  q5
+
+        subs            r9,  r9,  #1
+.if \oy
+        // Take the first two 16 bit coefficients of d28 and place them at the
+        // end of d31
+        vtrn.32         d31, d28
+.endif
+
+        vst1.16         {q0, q1}, [r0, :128], r2 // dst
+        bgt             1b
+
+.if \oy
+        cmp             r12, #0
+        mov             r9,  r12               // restore actual remaining h
+        bgt             L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
+.endif
+
+        b               9f
+.endm
+        fguv_loop_sx1   0, 0, 0
+        fguv_loop_sx1   0, 0, 1
+        fguv_loop_sx1   0, 1, 0
+        fguv_loop_sx1   0, 1, 1
+        fguv_loop_sx1   1, 0, 0
+        fguv_loop_sx1   1, 0, 1
+        fguv_loop_sx1   1, 1, 0
+        fguv_loop_sx1   1, 1, 1
+
+9:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/itx.S b/third_party/dav1d/src/arm/32/itx.S
index a1aea4139baf5..ceea025e45ce9 100644
--- a/third_party/dav1d/src/arm/32/itx.S
+++ b/third_party/dav1d/src/arm/32/itx.S
@@ -134,9 +134,9 @@ endconst
         vmlsl.s16       \d1, \s3, \c1
 .endm
 
-.macro vrshrn_8h d0, d1, s0, s1, shift
-        vrshrn.i32      \d0, \s0, \shift
-        vrshrn.i32      \d1, \s1, \shift
+.macro vqrshrn_8h d0, d1, s0, s1, shift
+        vqrshrn.s32     \d0, \s0, \shift
+        vqrshrn.s32     \d1, \s1, \shift
 .endm
 
 .macro scale_input c, r0, r1, r2 r3, r4, r5, r6, r7
@@ -418,11 +418,11 @@ endfunc
         vmull_vmlal     q3,  \r1, \r3, d0[3], d0[2]
         vmull_vmlsl     q2,  \r1, \r3, d0[2], d0[3]
         vmull_vmlal     q1,  \r0, \r2, d0[0], d0[0]
-        vrshrn.i32      d6,  q3,  #12
-        vrshrn.i32      d7,  q2,  #12
+        vqrshrn.s32     d6,  q3,  #12
+        vqrshrn.s32     d7,  q2,  #12
         vmull_vmlsl     q2,  \r0, \r2, d0[0], d0[0]
-        vrshrn.i32      d2,  q1,  #12
-        vrshrn.i32      d3,  q2,  #12
+        vqrshrn.s32     d2,  q1,  #12
+        vqrshrn.s32     d3,  q2,  #12
         vqadd.s16       \r0, d2,  d6
         vqsub.s16       \r3, d2,  d6
         vqadd.s16       \r1, d3,  d7
@@ -433,11 +433,11 @@ endfunc
         vmull_vmlal_8h  q6,  q7,  \r2, \r3, \r6, \r7, d0[3], d0[2]
         vmull_vmlsl_8h  q4,  q5,  \r2, \r3, \r6, \r7, d0[2], d0[3]
         vmull_vmlal_8h  q2,  q3,  \r0, \r1, \r4, \r5, d0[0], d0[0]
-        vrshrn_8h       d12, d13, q6,  q7,  #12
-        vrshrn_8h       d14, d15, q4,  q5,  #12
+        vqrshrn_8h      d12, d13, q6,  q7,  #12
+        vqrshrn_8h      d14, d15, q4,  q5,  #12
         vmull_vmlsl_8h  q4,  q5,  \r0, \r1, \r4, \r5, d0[0], d0[0]
-        vrshrn_8h       d4,  d5,  q2,  q3,  #12
-        vrshrn_8h       d6,  d7,  q4,  q5,  #12
+        vqrshrn_8h      d4,  d5,  q2,  q3,  #12
+        vqrshrn_8h      d6,  d7,  q4,  q5,  #12
         vqadd.s16       \q0, q2,  q6
         vqsub.s16       \q3, q2,  q6
         vqadd.s16       \q1, q3,  q7
@@ -478,10 +478,10 @@ endfunc
         vadd.s32        q3,  q3,  q10
         vsub.s32        q11, q11, q10
 
-        vrshrn.i32      \o0, q2,  #12
-        vrshrn.i32      \o2, q1,  #12
-        vrshrn.i32      \o1, q3,  #12
-        vrshrn.i32      \o3, q11, #12
+        vqrshrn.s32     \o0, q2,  #12
+        vqrshrn.s32     \o2, q1,  #12
+        vqrshrn.s32     \o1, q3,  #12
+        vqrshrn.s32     \o3, q11, #12
 .endm
 
 function inv_adst_4h_x4_neon, export=1
@@ -533,21 +533,21 @@ endfunc
         vsub.s32        q4,  q4,  q2 // out3
         vsub.s32        q5,  q5,  q3
 
-        vrshrn.i32      d20, q10, #12
-        vrshrn.i32      d21, q11, #12
+        vqrshrn.s32     d20, q10, #12
+        vqrshrn.s32     d21, q11, #12
 
-        vrshrn.i32      \o0, q8,  #12
-        vrshrn.i32      \o1, q9,  #12
+        vqrshrn.s32     \o0, q8,  #12
+        vqrshrn.s32     \o1, q9,  #12
 
 .ifc \o4, d18
         vmov            q9,  q10
 .endif
 
-        vrshrn.i32      \o2, q6,  #12
-        vrshrn.i32      \o3, q7,  #12
+        vqrshrn.s32     \o2, q6,  #12
+        vqrshrn.s32     \o3, q7,  #12
 
-        vrshrn.i32      \o6, q4,  #12
-        vrshrn.i32      \o7, q5,  #12
+        vqrshrn.s32     \o6, q4,  #12
+        vqrshrn.s32     \o7, q5,  #12
 .endm
 
 function inv_adst_8h_x4_neon, export=1
@@ -702,11 +702,11 @@ def_fn_4x4 identity, flipadst
         vmull_vmlsl_8h  q2,   q3,   \r2,  \r3,  \r14, \r15, d1[0], d1[1] // -> t4a
         vmull_vmlal_8h  q4,   q5,   \r2,  \r3,  \r14, \r15, d1[1], d1[0] // -> t7a
         vmull_vmlsl_8h  q6,   q7,   \r10, \r11, \r6,  \r7,  d1[2], d1[3] // -> t5a
-        vrshrn_8h       \r2,  \r3,  q2,   q3,   #12         // t4a
-        vrshrn_8h       \r14, \r15, q4,   q5,   #12         // t7a
+        vqrshrn_8h      \r2,  \r3,  q2,   q3,   #12         // t4a
+        vqrshrn_8h      \r14, \r15, q4,   q5,   #12         // t7a
         vmull_vmlal_8h  q2,   q3,   \r10, \r11, \r6,  \r7,  d1[3], d1[2] // -> t6a
-        vrshrn_8h       \r6,  \r7,  q6,   q7,   #12         // t5a
-        vrshrn_8h       \r10, \r11, q2,   q3,   #12         // t6a
+        vqrshrn_8h      \r6,  \r7,  q6,   q7,   #12         // t5a
+        vqrshrn_8h      \r10, \r11, q2,   q3,   #12         // t6a
 
         vqadd.s16       q2,   \q1,  \q3 // t4
         vqsub.s16       \q1,  \q1,  \q3 // t5a
@@ -715,8 +715,8 @@ def_fn_4x4 identity, flipadst
 
         vmull_vmlsl_8h  q4,   q5,   \r6,  \r7,  \r2,  \r3,  d0[0], d0[0] // -> t5
         vmull_vmlal_8h  q6,   q7,   \r6,  \r7,  \r2,  \r3,  d0[0], d0[0] // -> t6
-        vrshrn_8h       d8,   d9,   q4,   q5,  #12 // t5
-        vrshrn_8h       d10,  d11,  q6,   q7,  #12 // t6
+        vqrshrn_8h      d8,   d9,   q4,   q5,  #12 // t5
+        vqrshrn_8h      d10,  d11,  q6,   q7,  #12 // t6
 
         vqsub.s16       \q7,  \q0,  q3 // out7
         vqadd.s16       \q0,  \q0,  q3 // out0
@@ -735,11 +735,11 @@ def_fn_4x4 identity, flipadst
         vmull_vmlsl     q1,   \r1,  \r7, d1[0], d1[1] // -> t4a
         vmull_vmlal     q2,   \r1,  \r7, d1[1], d1[0] // -> t7a
         vmull_vmlsl     q3,   \r5,  \r3, d1[2], d1[3] // -> t5a
-        vrshrn.i32      \r1,  q1,   #12               // t4a
+        vqrshrn.s32     \r1,  q1,   #12               // t4a
         vmull_vmlal     q1,   \r5,  \r3, d1[3], d1[2] // -> t6a
-        vrshrn.i32      \r7,  q2,   #12               // t7a
-        vrshrn.i32      \r3,  q3,   #12               // t5a
-        vrshrn.i32      \r5,  q1,   #12               // taa
+        vqrshrn.s32     \r7,  q2,   #12               // t7a
+        vqrshrn.s32     \r3,  q3,   #12               // t5a
+        vqrshrn.s32     \r5,  q1,   #12               // t6a
 
         vqadd.s16       d2,   \r1,  \r3 // t4
         vqsub.s16       \r1,  \r1,  \r3 // t5a
@@ -748,8 +748,8 @@ def_fn_4x4 identity, flipadst
 
         vmull_vmlsl     q2,   \r3,  \r1, d0[0], d0[0] // -> t5
         vmull_vmlal     q3,   \r3,  \r1, d0[0], d0[0] // -> t6
-        vrshrn.i32      d4,   q2,   #12               // t5
-        vrshrn.i32      d5,   q3,   #12               // t6
+        vqrshrn.s32     d4,   q2,   #12               // t5
+        vqrshrn.s32     d5,   q3,   #12               // t6
 
         vqsub.s16       \r7,  \r0,  d3 // out7
         vqadd.s16       \r0,  \r0,  d3 // out0
@@ -783,19 +783,19 @@ endfunc
         vmull_vmlal_8h  q2,  q3,  d30, d31, d16, d17, d0[0], d0[1]
         vmull_vmlsl_8h  q4,  q5,  d30, d31, d16, d17, d0[1], d0[0]
         vmull_vmlal_8h  q6,  q7,  d26, d27, d20, d21, d0[2], d0[3]
-        vrshrn_8h       d16, d17, q2,  q3,  #12  // t0a
-        vrshrn_8h       d30, d31, q4,  q5,  #12  // t1a
+        vqrshrn_8h      d16, d17, q2,  q3,  #12  // t0a
+        vqrshrn_8h      d30, d31, q4,  q5,  #12  // t1a
         vmull_vmlsl_8h  q2,  q3,  d26, d27, d20, d21, d0[3], d0[2]
         vmull_vmlal_8h  q4,  q5,  d22, d23, d24, d25, d1[0], d1[1]
-        vrshrn_8h       d20, d21, q6,  q7,  #12  // t2a
-        vrshrn_8h       d26, d27, q2,  q3,  #12  // t3a
+        vqrshrn_8h      d20, d21, q6,  q7,  #12  // t2a
+        vqrshrn_8h      d26, d27, q2,  q3,  #12  // t3a
         vmull_vmlsl_8h  q6,  q7,  d22, d23, d24, d25, d1[1], d1[0]
         vmull_vmlal_8h  q2,  q3,  d18, d19, d28, d29, d1[2], d1[3]
-        vrshrn_8h       d24, d25, q4,  q5,  #12  // t4a
-        vrshrn_8h       d22, d23, q6,  q7,  #12  // t5a
+        vqrshrn_8h      d24, d25, q4,  q5,  #12  // t4a
+        vqrshrn_8h      d22, d23, q6,  q7,  #12  // t5a
         vmull_vmlsl_8h  q4,  q5,  d18, d19, d28, d29, d1[3], d1[2]
-        vrshrn_8h       d28, d29, q2,  q3,  #12  // t6a
-        vrshrn_8h       d18, d19, q4,  q5,  #12  // t7a
+        vqrshrn_8h      d28, d29, q2,  q3,  #12  // t6a
+        vqrshrn_8h      d18, d19, q4,  q5,  #12  // t7a
 
         vqadd.s16       q2,  q8,  q12 // t0
         vqsub.s16       q3,  q8,  q12 // t4
@@ -810,13 +810,13 @@ endfunc
         vmull_vmlsl_8h  q12, q13, d6,  d7,  d10, d11, d2[2], d2[3]
         vmull_vmlsl_8h  q14, q15, d22, d23, d14, d15, d2[3], d2[2]
 
-        vrshrn_8h       d6,  d7,  q8,  q9,  #12  // t4a
-        vrshrn_8h       d10, d11, q12, q13, #12  // t5a
+        vqrshrn_8h      d6,  d7,  q8,  q9,  #12  // t4a
+        vqrshrn_8h      d10, d11, q12, q13, #12  // t5a
 
         vmull_vmlal_8h  q8,  q9,  d22, d23, d14, d15, d2[2], d2[3]
 
-        vrshrn_8h       d14, d15, q14, q15, #12  // t6a
-        vrshrn_8h       d22, d23, q8,  q9,  #12  // t7a
+        vqrshrn_8h      d14, d15, q14, q15, #12  // t6a
+        vqrshrn_8h      d22, d23, q8,  q9,  #12  // t7a
 
         vqadd.s16       \q0, q2,  q6  // out0
         vqsub.s16       q2,  q2,  q6  // t2
@@ -833,11 +833,11 @@ endfunc
         vmull_vmlal_8h  q10, q11, d4,  d5,  d8,  d9,  d2[0], d2[0] // -> out3 (q11 or q12)
         vmull_vmlsl_8h  q6,  q7,  d4,  d5,  d8,  d9,  d2[0], d2[0] // -> out4 (q12 or q11)
         vmull_vmlsl_8h  q12, q13, d6,  d7,  d10, d11, d2[0], d2[0] // -> out5 (q13 or q10)
-        vrshrn_8h       d4,  d5,  q10, q11, #12 // out3
+        vqrshrn_8h      d4,  d5,  q10, q11, #12 // out3
         vmull_vmlal_8h  q10, q11, d6,  d7,  d10, d11, d2[0], d2[0] // -> out2 (q10 or q13)
-        vrshrn_8h       d6,  d7,  q12, q13, #12 // out5
-        vrshrn_8h       \r4, \r5, q10, q11, #12 // out2 (q10 or q13)
-        vrshrn_8h       \r8, \r9, q6,  q7,  #12 // out4 (q12 or q11)
+        vqrshrn_8h      d6,  d7,  q12, q13, #12 // out5
+        vqrshrn_8h      \r4, \r5, q10, q11, #12 // out2 (q10 or q13)
+        vqrshrn_8h      \r8, \r9, q6,  q7,  #12 // out4 (q12 or q11)
 
         vqneg.s16       \q3, q2     // out3
         vqneg.s16       \q5, q3     // out5
@@ -850,19 +850,19 @@ endfunc
         vmull_vmlal     q2,  d23, d16, d0[0], d0[1]
         vmull_vmlsl     q3,  d23, d16, d0[1], d0[0]
         vmull_vmlal     q4,  d21, d18, d0[2], d0[3]
-        vrshrn.i32      d16, q2,  #12 // t0a
-        vrshrn.i32      d23, q3,  #12 // t1a
+        vqrshrn.s32     d16, q2,  #12 // t0a
+        vqrshrn.s32     d23, q3,  #12 // t1a
         vmull_vmlsl     q5,  d21, d18, d0[3], d0[2]
         vmull_vmlal     q6,  d19, d20, d1[0], d1[1]
-        vrshrn.i32      d18, q4,  #12 // t2a
-        vrshrn.i32      d21, q5,  #12 // t3a
+        vqrshrn.s32     d18, q4,  #12 // t2a
+        vqrshrn.s32     d21, q5,  #12 // t3a
         vmull_vmlsl     q7,  d19, d20, d1[1], d1[0]
         vmull_vmlal     q2,  d17, d22, d1[2], d1[3]
-        vrshrn.i32      d20, q6,  #12 // t4a
-        vrshrn.i32      d19, q7,  #12 // t5a
+        vqrshrn.s32     d20, q6,  #12 // t4a
+        vqrshrn.s32     d19, q7,  #12 // t5a
         vmull_vmlsl     q3,  d17, d22, d1[3], d1[2]
-        vrshrn.i32      d22, q2,  #12 // t6a
-        vrshrn.i32      d17, q3,  #12 // t7a
+        vqrshrn.s32     d22, q2,  #12 // t6a
+        vqrshrn.s32     d17, q3,  #12 // t7a
 
         vqadd.s16       d4,  d16, d20 // t0
         vqsub.s16       d5,  d16, d20 // t4
@@ -877,13 +877,13 @@ endfunc
         vmull_vmlsl     q10, d5,  d7,  d2[2], d2[3]
         vmull_vmlsl     q11, d19, d9,  d2[3], d2[2]
 
-        vrshrn.i32      d5,  q8,  #12 // t4a
-        vrshrn.i32      d7,  q10, #12 // t5a
+        vqrshrn.s32     d5,  q8,  #12 // t4a
+        vqrshrn.s32     d7,  q10, #12 // t5a
 
         vmull_vmlal     q8,  d19, d9,  d2[2], d2[3]
 
-        vrshrn.i32      d9,  q11, #12 // t6a
-        vrshrn.i32      d19, q8,  #12 // t7a
+        vqrshrn.s32     d9,  q11, #12 // t6a
+        vqrshrn.s32     d19, q8,  #12 // t7a
 
         vqadd.s16       \r0, d4,  d8  // out0
         vqsub.s16       d4,  d4,  d8  // t2
@@ -900,11 +900,11 @@ endfunc
         vmull_vmlal     q9,  d4,  d6,  d2[0], d2[0] // -> out3 (d19 or d20)
         vmull_vmlsl     q4,  d4,  d6,  d2[0], d2[0] // -> out4 (d20 or d19)
         vmull_vmlsl     q10, d5,  d7,  d2[0], d2[0] // -> out5 (d21 or d18)
-        vrshrn.i32      d4,  q9,  #12 // out3
+        vqrshrn.s32     d4,  q9,  #12 // out3
         vmull_vmlal     q9,  d5,  d7,  d2[0], d2[0] // -> out2 (d18 or d21)
-        vrshrn.i32      d5,  q10, #12 // out5
-        vrshrn.i32      \r2, q9,  #12 // out2 (d18 or d21)
-        vrshrn.i32      \r4, q4,  #12 // out4 (d20 or d19)
+        vqrshrn.s32     d5,  q10, #12 // out5
+        vqrshrn.s32     \r2, q9,  #12 // out2 (d18 or d21)
+        vqrshrn.s32     \r4, q4,  #12 // out4 (d20 or d19)
 
         vqneg.s16       \r3, d4       // out3
         vqneg.s16       \r5, d5       // out5
@@ -1122,19 +1122,19 @@ function inv_dct_4h_x16_neon, export=1
         vmull_vmlsl     q2,  d17, d31, d2[0], d2[1]  // -> t8a
         vmull_vmlal     q3,  d17, d31, d2[1], d2[0]  // -> t15a
         vmull_vmlsl     q4,  d25, d23, d2[2], d2[3]  // -> t9a
-        vrshrn.i32      d17, q2,  #12                // t8a
-        vrshrn.i32      d31, q3,  #12                // t15a
+        vqrshrn.s32     d17, q2,  #12                // t8a
+        vqrshrn.s32     d31, q3,  #12                // t15a
         vmull_vmlal     q2,  d25, d23, d2[3], d2[2]  // -> t14a
         vmull_vmlsl     q3,  d21, d27, d3[0], d3[1]  // -> t10a
-        vrshrn.i32      d23, q4,  #12                // t9a
-        vrshrn.i32      d25, q2,  #12                // t14a
+        vqrshrn.s32     d23, q4,  #12                // t9a
+        vqrshrn.s32     d25, q2,  #12                // t14a
         vmull_vmlal     q4,  d21, d27, d3[1], d3[0]  // -> t13a
         vmull_vmlsl     q2,  d29, d19, d3[2], d3[3]  // -> t11a
-        vrshrn.i32      d21, q3,  #12                // t10a
-        vrshrn.i32      d27, q4,  #12                // t13a
+        vqrshrn.s32     d21, q3,  #12                // t10a
+        vqrshrn.s32     d27, q4,  #12                // t13a
         vmull_vmlal     q3,  d29, d19, d3[3], d3[2]  // -> t12a
-        vrshrn.i32      d19, q2,  #12                // t11a
-        vrshrn.i32      d29, q3,  #12                // t12a
+        vqrshrn.s32     d19, q2,  #12                // t11a
+        vqrshrn.s32     d29, q3,  #12                // t12a
 
         idct_4h_x8      d16, d18, d20, d22, d24, d26, d28, d30
 
@@ -1149,14 +1149,14 @@ function inv_dct_4h_x16_neon, export=1
 
         vmull_vmlsl     q3,  d5,  d4,  d0[2], d0[3]  // -> t9a
         vmull_vmlal     q4,  d5,  d4,  d0[3], d0[2]  // -> t14a
-        vrshrn.i32      d21, q3,  #12                // t9a
-        vrshrn.i32      d27, q4,  #12                // t14a
+        vqrshrn.s32     d21, q3,  #12                // t9a
+        vqrshrn.s32     d27, q4,  #12                // t14a
 
         vmull_vmlsl     q3,  d29, d23, d0[2], d0[3]  // -> t13a
         vmull_vmlal     q4,  d29, d23, d0[3], d0[2]  // -> t10a
-        vrshrn.i32      d29, q3,  #12                // t13a
+        vqrshrn.s32     d29, q3,  #12                // t13a
         vneg.s32        q4,  q4
-        vrshrn.i32      d23, q4,  #12                // t10a
+        vqrshrn.s32     d23, q4,  #12                // t10a
 
         vqsub.s16       d4,  d17, d19  // t11a
         vqadd.s16       d17, d17, d19  // t8a
@@ -1171,11 +1171,11 @@ function inv_dct_4h_x16_neon, export=1
         vmull_vmlal     q4,  d5,  d4,  d0[0], d0[0]  // -> t12
         vmull_vmlsl     q2,  d25, d21, d0[0], d0[0]  // -> t10a
 
-        vrshrn.i32      d6,  q3,  #12  // t11
-        vrshrn.i32      d7,  q4,  #12  // t12
+        vqrshrn.s32     d6,  q3,  #12  // t11
+        vqrshrn.s32     d7,  q4,  #12  // t12
         vmull_vmlal     q4,  d25, d21, d0[0], d0[0]  // -> t13a
-        vrshrn.i32      d4,  q2,  #12  // t10a
-        vrshrn.i32      d5,  q4,  #12  // t13a
+        vqrshrn.s32     d4,  q2,  #12  // t10a
+        vqrshrn.s32     d5,  q4,  #12  // t13a
 
         vqadd.s16       d8,  d16, d31  // out0
         vqsub.s16       d31, d16, d31  // out15
@@ -1208,35 +1208,35 @@ endfunc
         vmull_vmlal     q2,  d31, d16, d0[0], d0[1] // -> t0
         vmull_vmlsl     q3,  d31, d16, d0[1], d0[0] // -> t1
         vmull_vmlal     q4,  d29, d18, d0[2], d0[3] // -> t2
-        vrshrn.i32      d16, q2,  #12               // t0
-        vrshrn.i32      d31, q3,  #12               // t1
+        vqrshrn.s32     d16, q2,  #12               // t0
+        vqrshrn.s32     d31, q3,  #12               // t1
         vmull_vmlsl     q2,  d29, d18, d0[3], d0[2] // -> t3
         vmull_vmlal     q3,  d27, d20, d1[0], d1[1] // -> t4
-        vrshrn.i32      d18, q4,  #12               // t2
-        vrshrn.i32      d29, q2,  #12               // t3
+        vqrshrn.s32     d18, q4,  #12               // t2
+        vqrshrn.s32     d29, q2,  #12               // t3
         vmull_vmlsl     q4,  d27, d20, d1[1], d1[0] // -> t5
         vmull_vmlal     q2,  d25, d22, d1[2], d1[3] // -> t6
-        vrshrn.i32      d20, q3,  #12               // t4
-        vrshrn.i32      d27, q4,  #12               // t5
+        vqrshrn.s32     d20, q3,  #12               // t4
+        vqrshrn.s32     d27, q4,  #12               // t5
         vmull_vmlsl     q3,  d25, d22, d1[3], d1[2] // -> t7
         vmull_vmlal     q4,  d23, d24, d2[0], d2[1] // -> t8
-        vrshrn.i32      d22, q2,  #12               // t6
-        vrshrn.i32      d25, q3,  #12               // t7
+        vqrshrn.s32     d22, q2,  #12               // t6
+        vqrshrn.s32     d25, q3,  #12               // t7
         vmull_vmlsl     q2,  d23, d24, d2[1], d2[0] // -> t9
         vmull_vmlal     q3,  d21, d26, d2[2], d2[3] // -> t10
-        vrshrn.i32      d23, q4,  #12               // t8
-        vrshrn.i32      d24, q2,  #12               // t9
+        vqrshrn.s32     d23, q4,  #12               // t8
+        vqrshrn.s32     d24, q2,  #12               // t9
         vmull_vmlsl     q4,  d21, d26, d2[3], d2[2] // -> t11
         vmull_vmlal     q2,  d19, d28, d3[0], d3[1] // -> t12
-        vrshrn.i32      d21, q3,  #12               // t10
-        vrshrn.i32      d26, q4,  #12               // t11
+        vqrshrn.s32     d21, q3,  #12               // t10
+        vqrshrn.s32     d26, q4,  #12               // t11
         vmull_vmlsl     q3,  d19, d28, d3[1], d3[0] // -> t13
         vmull_vmlal     q4,  d17, d30, d3[2], d3[3] // -> t14
-        vrshrn.i32      d19, q2,  #12               // t12
-        vrshrn.i32      d28, q3,  #12               // t13
+        vqrshrn.s32     d19, q2,  #12               // t12
+        vqrshrn.s32     d28, q3,  #12               // t13
         vmull_vmlsl     q2,  d17, d30, d3[3], d3[2] // -> t15
-        vrshrn.i32      d17, q4,  #12               // t14
-        vrshrn.i32      d30, q2,  #12               // t15
+        vqrshrn.s32     d17, q4,  #12               // t14
+        vqrshrn.s32     d30, q2,  #12               // t15
 
         vld1.16         {q0}, [r12, :128]
 
@@ -1260,19 +1260,19 @@ endfunc
         vmull_vmlal     q2,  d2,  d3,  d1[1], d1[0] // -> t8
         vmull_vmlsl     q3,  d2,  d3,  d1[0], d1[1] // -> t9
         vmull_vmlal     q4,  d18, d29, d1[3], d1[2] // -> t10
-        vrshrn.i32      d17, q2,  #12               // t8
-        vrshrn.i32      d30, q3,  #12               // t9
+        vqrshrn.s32     d17, q2,  #12               // t8
+        vqrshrn.s32     d30, q3,  #12               // t9
         vmull_vmlsl     q2,  d18, d29, d1[2], d1[3] // -> t11
         vmull_vmlsl     q3,  d27, d20, d1[1], d1[0] // -> t12
-        vrshrn.i32      d18, q4,  #12               // t10
-        vrshrn.i32      d29, q2,  #12               // t11
+        vqrshrn.s32     d18, q4,  #12               // t10
+        vqrshrn.s32     d29, q2,  #12               // t11
         vmull_vmlal     q4,  d27, d20, d1[0], d1[1] // -> t13
         vmull_vmlsl     q2,  d25, d22, d1[3], d1[2] // -> t14
-        vrshrn.i32      d27, q3,  #12               // t12
-        vrshrn.i32      d20, q4,  #12               // t13
+        vqrshrn.s32     d27, q3,  #12               // t12
+        vqrshrn.s32     d20, q4,  #12               // t13
         vmull_vmlal     q3,  d25, d22, d1[2], d1[3] // -> t15
-        vrshrn.i32      d25, q2,  #12               // t14
-        vrshrn.i32      d22, q3,  #12               // t15
+        vqrshrn.s32     d25, q2,  #12               // t14
+        vqrshrn.s32     d22, q3,  #12               // t15
 
         vqsub.s16       d2,  d16, d21 // t4
         vqadd.s16       d16, d16, d21 // t0
@@ -1294,19 +1294,19 @@ endfunc
         vmull_vmlal     q2,  d2,  d3,  d0[3], d0[2] // -> t4a
         vmull_vmlsl     q3,  d2,  d3,  d0[2], d0[3] // -> t5a
         vmull_vmlsl     q4,  d24, d23, d0[3], d0[2] // -> t6a
-        vrshrn.i32      d22, q2,  #12               // t4a
-        vrshrn.i32      d25, q3,  #12               // t5a
+        vqrshrn.s32     d22, q2,  #12               // t4a
+        vqrshrn.s32     d25, q3,  #12               // t5a
         vmull_vmlal     q2,  d24, d23, d0[2], d0[3] // -> t7a
         vmull_vmlal     q3,  d17, d30, d0[3], d0[2] // -> t12
-        vrshrn.i32      d24, q4,  #12               // t6a
-        vrshrn.i32      d23, q2,  #12               // t7a
+        vqrshrn.s32     d24, q4,  #12               // t6a
+        vqrshrn.s32     d23, q2,  #12               // t7a
         vmull_vmlsl     q4,  d17, d30, d0[2], d0[3] // -> t13
         vmull_vmlsl     q2,  d29, d18, d0[3], d0[2] // -> t14
-        vrshrn.i32      d17, q3,  #12               // t12
+        vqrshrn.s32     d17, q3,  #12               // t12
         vmull_vmlal     q3,  d29, d18, d0[2], d0[3] // -> t15
-        vrshrn.i32      d29, q4,  #12               // t13
-        vrshrn.i32      d30, q2,  #12               // t14
-        vrshrn.i32      d18, q3,  #12               // t15
+        vqrshrn.s32     d29, q4,  #12               // t13
+        vqrshrn.s32     d30, q2,  #12               // t14
+        vqrshrn.s32     d18, q3,  #12               // t15
 
         vqsub.s16       d2,  d16, d21 // t2a
 .ifc \o0, d16
@@ -1343,21 +1343,21 @@ endfunc
         vmull_vmlal     q2,  d2,  d21, d0[0], d0[0] // -> out7 (d23 or d24)
         vmull_vmlal     q3,  d26, d3,  d0[0], d0[0] // -> out5 (d21 or d26)
 
-        vrshrn.i32      d24, q12, #12 // out8
-        vrshrn.i32      d4,  q2,  #12 // out7
-        vrshrn.i32      d5,  q3,  #12 // out5
+        vqrshrn.s32     d24, q12, #12 // out8
+        vqrshrn.s32     d4,  q2,  #12 // out7
+        vqrshrn.s32     d5,  q3,  #12 // out5
         vmull_vmlsl     q4,  d26, d3,  d0[0], d0[0] // -> out10 (d26 or d21)
         vmull_vmlal     q1,  d22, d23, d0[0], d0[0] // -> out4 (d20 or d27)
-        vrshrn.i32      d26, q4,  #12 // out10
+        vqrshrn.s32     d26, q4,  #12 // out10
 
         vmull_vmlsl     q4,  d22, d23, d0[0], d0[0] // -> out11 (d27 or d20)
         vmull_vmlal     q11, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25)
         vmull_vmlsl     q3,  d27, d20, d0[0], d0[0] // -> out9 (d25 or d22)
 
-        vrshrn.i32      \o4, q1,  #12 // out4
-        vrshrn.i32      d7,  q3,  #12 // out9
-        vrshrn.i32      d6,  q4,  #12 // out11
-        vrshrn.i32      \o6, q11, #12 // out6
+        vqrshrn.s32     \o4, q1,  #12 // out4
+        vqrshrn.s32     d7,  q3,  #12 // out9
+        vqrshrn.s32     d6,  q4,  #12 // out11
+        vqrshrn.s32     \o6, q11, #12 // out6
 
 .ifc \o8, d23
         vmov            \o8, d24
@@ -1927,35 +1927,35 @@ function inv_dct32_odd_4h_x16_neon, export=1
         vmull_vmlsl     q2,  d16, d31, d0[0], d0[1] // -> t16a
         vmull_vmlal     q3,  d16, d31, d0[1], d0[0] // -> t31a
         vmull_vmlsl     q4,  d24, d23, d0[2], d0[3] // -> t17a
-        vrshrn.i32      d16, q2,  #12               // t16a
-        vrshrn.i32      d31, q3,  #12               // t31a
+        vqrshrn.s32     d16, q2,  #12               // t16a
+        vqrshrn.s32     d31, q3,  #12               // t31a
         vmull_vmlal     q2,  d24, d23, d0[3], d0[2] // -> t30a
         vmull_vmlsl     q3,  d20, d27, d1[0], d1[1] // -> t18a
-        vrshrn.i32      d24, q4,  #12               // t17a
-        vrshrn.i32      d23, q2,  #12               // t30a
+        vqrshrn.s32     d24, q4,  #12               // t17a
+        vqrshrn.s32     d23, q2,  #12               // t30a
         vmull_vmlal     q4,  d20, d27, d1[1], d1[0] // -> t29a
         vmull_vmlsl     q2,  d28, d19, d1[2], d1[3] // -> t19a
-        vrshrn.i32      d20, q3,  #12               // t18a
-        vrshrn.i32      d27, q4,  #12               // t29a
+        vqrshrn.s32     d20, q3,  #12               // t18a
+        vqrshrn.s32     d27, q4,  #12               // t29a
         vmull_vmlal     q3,  d28, d19, d1[3], d1[2] // -> t28a
         vmull_vmlsl     q4,  d18, d29, d2[0], d2[1] // -> t20a
-        vrshrn.i32      d28, q2,  #12               // t19a
-        vrshrn.i32      d19, q3,  #12               // t28a
+        vqrshrn.s32     d28, q2,  #12               // t19a
+        vqrshrn.s32     d19, q3,  #12               // t28a
         vmull_vmlal     q2,  d18, d29, d2[1], d2[0] // -> t27a
         vmull_vmlsl     q3,  d26, d21, d2[2], d2[3] // -> t21a
-        vrshrn.i32      d18, q4,  #12               // t20a
-        vrshrn.i32      d29, q2,  #12               // t27a
+        vqrshrn.s32     d18, q4,  #12               // t20a
+        vqrshrn.s32     d29, q2,  #12               // t27a
         vmull_vmlal     q4,  d26, d21, d2[3], d2[2] // -> t26a
         vmull_vmlsl     q2,  d22, d25, d3[0], d3[1] // -> t22a
-        vrshrn.i32      d26, q3,  #12               // t21a
-        vrshrn.i32      d21, q4,  #12               // t26a
+        vqrshrn.s32     d26, q3,  #12               // t21a
+        vqrshrn.s32     d21, q4,  #12               // t26a
         vmull_vmlal     q3,  d22, d25, d3[1], d3[0] // -> t25a
         vmull_vmlsl     q4,  d30, d17, d3[2], d3[3] // -> t23a
-        vrshrn.i32      d22, q2,  #12               // t22a
-        vrshrn.i32      d25, q3,  #12               // t25a
+        vqrshrn.s32     d22, q2,  #12               // t22a
+        vqrshrn.s32     d25, q3,  #12               // t25a
         vmull_vmlal     q2,  d30, d17, d3[3], d3[2] // -> t24a
-        vrshrn.i32      d30, q4,  #12               // t23a
-        vrshrn.i32      d17, q2,  #12               // t24a
+        vqrshrn.s32     d30, q4,  #12               // t23a
+        vqrshrn.s32     d17, q2,  #12               // t24a
 
         vld1.16         {q0}, [r12, :128]
 
@@ -1979,21 +1979,21 @@ function inv_dct32_odd_4h_x16_neon, export=1
         vmull_vmlsl     q2,  d3,  d2,  d1[0], d1[1] // -> t17a
         vmull_vmlal     q3,  d3,  d2,  d1[1], d1[0] // -> t30a
         vmull_vmlal     q4,  d19, d24, d1[1], d1[0] // -> t18a
-        vrshrn.i32      d21, q2,  #12               // t17a
-        vrshrn.i32      d27, q3,  #12               // t30a
+        vqrshrn.s32     d21, q2,  #12               // t17a
+        vqrshrn.s32     d27, q3,  #12               // t30a
         vneg.s32        q4,  q4                     // -> t18a
         vmull_vmlsl     q1,  d19, d24, d1[0], d1[1] // -> t29a
         vmull_vmlsl     q2,  d22, d18, d1[2], d1[3] // -> t21a
-        vrshrn.i32      d19, q4,  #12               // t18a
-        vrshrn.i32      d24, q1,  #12               // t29a
+        vqrshrn.s32     d19, q4,  #12               // t18a
+        vqrshrn.s32     d24, q1,  #12               // t29a
         vmull_vmlal     q3,  d22, d18, d1[3], d1[2] // -> t26a
         vmull_vmlal     q4,  d17, d20, d1[3], d1[2] // -> t22a
-        vrshrn.i32      d22, q2,  #12               // t21a
-        vrshrn.i32      d18, q3,  #12               // t26a
+        vqrshrn.s32     d22, q2,  #12               // t21a
+        vqrshrn.s32     d18, q3,  #12               // t26a
         vneg.s32        q4,  q4                     // -> t22a
         vmull_vmlsl     q1,  d17, d20, d1[2], d1[3] // -> t25a
-        vrshrn.i32      d17, q4,  #12               // t22a
-        vrshrn.i32      d20, q1,  #12               // t25a
+        vqrshrn.s32     d17, q4,  #12               // t22a
+        vqrshrn.s32     d20, q1,  #12               // t25a
 
         vqsub.s16       d2,  d27, d24 // t29
         vqadd.s16       d27, d27, d24 // t30
@@ -2015,21 +2015,21 @@ function inv_dct32_odd_4h_x16_neon, export=1
         vmull_vmlsl     q2,  d2,  d3,  d0[2], d0[3] // -> t18a
         vmull_vmlal     q3,  d2,  d3,  d0[3], d0[2] // -> t29a
         vmull_vmlsl     q4,  d29, d24, d0[2], d0[3] // -> t19
-        vrshrn.i32      d18, q2,  #12               // t18a
-        vrshrn.i32      d25, q3,  #12               // t29a
+        vqrshrn.s32     d18, q2,  #12               // t18a
+        vqrshrn.s32     d25, q3,  #12               // t29a
         vmull_vmlal     q1,  d29, d24, d0[3], d0[2] // -> t28
         vmull_vmlal     q2,  d26, d19, d0[3], d0[2] // -> t20
-        vrshrn.i32      d29, q4,  #12               // t19
-        vrshrn.i32      d24, q1,  #12               // t28
+        vqrshrn.s32     d29, q4,  #12               // t19
+        vqrshrn.s32     d24, q1,  #12               // t28
         vneg.s32        q2,  q2                     // -> t20
         vmull_vmlsl     q3,  d26, d19, d0[2], d0[3] // -> t27
         vmull_vmlal     q4,  d20, d28, d0[3], d0[2] // -> t21a
-        vrshrn.i32      d26, q2,  #12               // t20
-        vrshrn.i32      d19, q3,  #12               // t27
+        vqrshrn.s32     d26, q2,  #12               // t20
+        vqrshrn.s32     d19, q3,  #12               // t27
         vneg.s32        q4,  q4                     // -> t21a
         vmull_vmlsl     q1,  d20, d28, d0[2], d0[3] // -> t26a
-        vrshrn.i32      d20, q4,  #12               // t21a
-        vrshrn.i32      d28, q1,  #12               // t26a
+        vqrshrn.s32     d20, q4,  #12               // t21a
+        vqrshrn.s32     d28, q1,  #12               // t26a
 
         vqsub.s16       d2,  d16, d30 // t23
         vqadd.s16       d16, d16, d30 // t16 = out16
@@ -2051,24 +2051,24 @@ function inv_dct32_odd_4h_x16_neon, export=1
 
         vmull_vmlsl     q2,  d24, d26, d0[0], d0[0] // -> t20
         vmull_vmlal     q3,  d24, d26, d0[0], d0[0] // -> t27
-        vrshrn.i32      d20, q2,  #12   // t20
-        vrshrn.i32      d22, q3,  #12   // t27
+        vqrshrn.s32     d20, q2,  #12   // t20
+        vqrshrn.s32     d22, q3,  #12   // t27
 
         vmull_vmlal     q2,  d25, d27, d0[0], d0[0] // -> t26a
         vmull_vmlsl     q3,  d25, d27, d0[0], d0[0] // -> t21a
         vmov            d27, d22        // t27
-        vrshrn.i32      d26, q2,  #12   // t26a
+        vqrshrn.s32     d26, q2,  #12   // t26a
 
         vmull_vmlsl     q12, d21, d23, d0[0], d0[0] // -> t22
         vmull_vmlal     q2,  d21, d23, d0[0], d0[0] // -> t25
-        vrshrn.i32      d21, q3,  #12   // t21a
-        vrshrn.i32      d22, q12, #12   // t22
-        vrshrn.i32      d25, q2,  #12   // t25
+        vqrshrn.s32     d21, q3,  #12   // t21a
+        vqrshrn.s32     d22, q12, #12   // t22
+        vqrshrn.s32     d25, q2,  #12   // t25
 
         vmull_vmlsl     q2,  d3,  d2,  d0[0], d0[0] // -> t23a
         vmull_vmlal     q3,  d3,  d2,  d0[0], d0[0] // -> t24a
-        vrshrn.i32      d23, q2,  #12   // t23a
-        vrshrn.i32      d24, q3,  #12   // t24a
+        vqrshrn.s32     d23, q2,  #12   // t23a
+        vqrshrn.s32     d24, q3,  #12   // t24a
 
         bx              lr
 endfunc
@@ -2679,11 +2679,11 @@ function inv_dct64_step1_neon
         vmull_vmlsl     q3,  d29, d26, d2[1], d2[0] // -> t61a
         vneg.s32        q2,  q2                     // t34a
         vmull_vmlsl     q4,  d30, d25, d2[1], d2[0] // -> t33a
-        vrshrn.i32      d26, q2,  #12               // t34a
+        vqrshrn.s32     d26, q2,  #12               // t34a
         vmull_vmlal     q2,  d30, d25, d2[0], d2[1] // -> t62a
-        vrshrn.i32      d29, q3,  #12               // t61a
-        vrshrn.i32      d25, q4,  #12               // t33a
-        vrshrn.i32      d30, q2,  #12               // t62a
+        vqrshrn.s32     d29, q3,  #12               // t61a
+        vqrshrn.s32     d25, q4,  #12               // t33a
+        vqrshrn.s32     d30, q2,  #12               // t62a
 
         vqadd.s16       d16, d24, d27    // t32a
         vqsub.s16       d19, d24, d27    // t35a
@@ -2697,11 +2697,11 @@ function inv_dct64_step1_neon
         vmull_vmlal     q2,  d21, d18, d2[2], d2[3] // -> t61a
         vmull_vmlsl     q3,  d21, d18, d2[3], d2[2] // -> t34a
         vmull_vmlal     q4,  d20, d19, d2[2], d2[3] // -> t60
-        vrshrn.i32      d21, q2,  #12               // t61a
-        vrshrn.i32      d18, q3,  #12               // t34a
+        vqrshrn.s32     d21, q2,  #12               // t61a
+        vqrshrn.s32     d18, q3,  #12               // t34a
         vmull_vmlsl     q2,  d20, d19, d2[3], d2[2] // -> t35
-        vrshrn.i32      d20, q4,  #12               // t60
-        vrshrn.i32      d19, q2,  #12               // t35
+        vqrshrn.s32     d20, q4,  #12               // t60
+        vqrshrn.s32     d19, q2,  #12               // t35
 
         vst1.16         {d16, d17, d18, d19}, [r6, :128]!
         vst1.16         {d20, d21, d22, d23}, [r6, :128]!
@@ -2738,12 +2738,12 @@ function inv_dct64_step2_neon
         vmull_vmlal     q2,  d27, d25, d0[3], d0[2] // -> t56a
         vmull_vmlsl     q3,  d27, d25, d0[2], d0[3] // -> t39a
         vmull_vmlal     q4,  d31, d28, d0[3], d0[2] // -> t40a
-        vrshrn.i32      d25, q2,  #12               // t56a
-        vrshrn.i32      d27, q3,  #12               // t39a
+        vqrshrn.s32     d25, q2,  #12               // t56a
+        vqrshrn.s32     d27, q3,  #12               // t39a
         vneg.s32        q4,  q4                     // t40a
         vmull_vmlsl     q2,  d31, d28, d0[2], d0[3] // -> t55a
-        vrshrn.i32      d31, q4,  #12               // t40a
-        vrshrn.i32      d28, q2,  #12               // t55a
+        vqrshrn.s32     d31, q4,  #12               // t40a
+        vqrshrn.s32     d28, q2,  #12               // t55a
 
         vqadd.s16       d16, d24, d29      // t32a
         vqsub.s16       d19, d24, d29      // t47a
@@ -2757,11 +2757,11 @@ function inv_dct64_step2_neon
         vmull_vmlsl     q2,  d21, d18, d0[0], d0[0] // -> t40a
         vmull_vmlal     q3,  d21, d18, d0[0], d0[0] // -> t55a
         vmull_vmlsl     q4,  d20, d19, d0[0], d0[0] // -> t47
-        vrshrn.i32      d18, q2,  #12               // t40a
-        vrshrn.i32      d21, q3,  #12               // t55a
+        vqrshrn.s32     d18, q2,  #12               // t40a
+        vqrshrn.s32     d21, q3,  #12               // t55a
         vmull_vmlal     q2,  d20, d19, d0[0], d0[0] // -> t48
-        vrshrn.i32      d19, q4,  #12               // t47
-        vrshrn.i32      d20, q2,  #12               // t48
+        vqrshrn.s32     d19, q4,  #12               // t47
+        vqrshrn.s32     d20, q2,  #12               // t48
 
         vstr            d16, [r6, #2*4*0]  // t32a
         vstr            d17, [r9, #2*4*0]  // t39
diff --git a/third_party/dav1d/src/arm/64/filmgrain.S b/third_party/dav1d/src/arm/64/filmgrain.S
new file mode 100644
index 0000000000000..6cdd7ec5fa399
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/filmgrain.S
@@ -0,0 +1,2010 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
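+// increment_seed advances the 16 bit film grain LFSR (taps 0, 1, 3 and 12)
+// by \steps bits in one go.  A scalar sketch of the shift=1 case, assuming a
+// 16 bit state r:
+//     bits = (r ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & ((1 << steps) - 1);
+//     r    = (r >> steps) | (bits << (16 - steps));
+// With shift=0 the state is left unshifted and the new bits are OR'd in at
+// bit 16 and above, so the shift can be applied later.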
+.macro increment_seed steps, shift=1
+        lsr             w11, w2,  #3
+        lsr             w12, w2,  #12
+        lsr             w13, w2,  #1
+        eor             w11, w2,  w11                     // (r >> 0) ^ (r >> 3)
+        eor             w12, w12, w13                     // (r >> 12) ^ (r >> 1)
+        eor             w11, w11, w12                     // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+        lsr             w2,  w2,  #\steps
+.endif
+        and             w11, w11, #((1 << \steps) - 1)    // bit
+.if \shift
+        orr             w2,  w2,  w11, lsl #(16 - \steps) // *state
+.else
+        orr             w2,  w2,  w11, lsl #16            // *state
+.endif
+.endm
+
+.macro read_rand dest, bits, age
+        ubfx            \dest,  x2,   #16 - \bits - \age, #\bits
+.endm
+
+.macro read_shift_rand dest, bits
+        ubfx            \dest,  x2,   #17 - \bits, #\bits
+        lsr             w2,  w2,  #1
+.endm
+
+// special calling convention:
+// w2 holds seed
+// x3 holds dav1d_gaussian_sequence
+// clobbers x11-x15
+// returns in v0.8h
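+// (i.e. it fills v0.8h with 8 entries of dav1d_gaussian_sequence, indexed by
+// 8 successive 11 bit outputs of the seed LFSR)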
+function get_gaussian_neon
+        increment_seed  4
+        read_rand       x14, 11,  3
+        read_rand       x15, 11,  2
+        add             x14, x3,  x14, lsl #1
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[0], [x14]
+        read_rand       x14, 11,  1
+        ld1             {v0.h}[1], [x15]
+        add             x14, x3,  x14, lsl #1
+        read_rand       x15, 11,  0
+        increment_seed  4
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[2], [x14]
+        read_rand       x14, 11,  3
+        ld1             {v0.h}[3], [x15]
+        add             x14, x3,  x14, lsl #1
+        read_rand       x15, 11,  2
+        ld1             {v0.h}[4], [x14]
+        add             x15, x3,  x15, lsl #1
+        read_rand       x14, 11,  1
+        ld1             {v0.h}[5], [x15]
+        read_rand       x15, 11,  0
+        add             x14, x3,  x14, lsl #1
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[6], [x14]
+        ld1             {v0.h}[7], [x15]
+        ret
+endfunc
+
+.macro get_grain_row r0, r1, r2, r3, r4, r5
+        bl              get_gaussian_neon
+        srshl           \r5\().8h,  v0.8h,  v31.8h
+        xtn             \r0\().8b,  \r5\().8h
+        bl              get_gaussian_neon
+        srshl           \r5\().8h,  v0.8h,  v31.8h
+        xtn2            \r0\().16b, \r5\().8h
+        bl              get_gaussian_neon
+        srshl           \r5\().8h,  v0.8h,  v31.8h
+        xtn             \r1\().8b,  \r5\().8h
+        bl              get_gaussian_neon
+        srshl           \r5\().8h,  v0.8h,  v31.8h
+        xtn2            \r1\().16b, \r5\().8h
+        bl              get_gaussian_neon
+        srshl           \r5\().8h,  v0.8h,  v31.8h
+        xtn             \r2\().8b,  \r5\().8h
+        bl              get_gaussian_neon
+        srshl           \r5\().8h,  v0.8h,  v31.8h
+        xtn2            \r2\().16b, \r5\().8h
+        bl              get_gaussian_neon
+        srshl           \r5\().8h,  v0.8h,  v31.8h
+        xtn             \r3\().8b,  \r5\().8h
+        bl              get_gaussian_neon
+        srshl           \r5\().8h,  v0.8h,  v31.8h
+        xtn2            \r3\().16b, \r5\().8h
+        bl              get_gaussian_neon
+        srshl           \r5\().8h,  v0.8h,  v31.8h
+        xtn             \r4\().8b,  \r5\().8h
+        bl              get_gaussian_neon
+        srshl           \r5\().8h,  v0.8h,  v31.8h
+        xtn2            \r4\().16b, \r5\().8h
+        increment_seed  2
+        read_rand       x14, 11,  1
+        read_rand       x15, 11,  0
+        add             x14, x3,  x14, lsl #1
+        add             x15, x3,  x15, lsl #1
+        ld1             {\r5\().h}[0], [x14]
+        ld1             {\r5\().h}[1], [x15]
+        srshl           v0.4h,      \r5\().4h,  v31.4h
+        xtn             \r5\().8b,  v0.8h
+.endm
+
+.macro store_grain_row r0, r1, r2, r3, r4, r5
+        st1             {\r0\().16b,\r1\().16b}, [x0], #32
+        st1             {\r2\().16b,\r3\().16b}, [x0], #32
+        st1             {\r4\().16b},  [x0], #16
+        st1             {\r5\().h}[0], [x0], #2
+.endm
+
+.macro get_grain_row_44 r0, r1, r2
+        bl              get_gaussian_neon
+        srshl           \r2\().8h,  v0.8h,  v31.8h
+        xtn             \r0\().8b,  \r2\().8h
+        bl              get_gaussian_neon
+        srshl           \r2\().8h,  v0.8h,  v31.8h
+        xtn2            \r0\().16b, \r2\().8h
+        bl              get_gaussian_neon
+        srshl           \r2\().8h,  v0.8h,  v31.8h
+        xtn             \r1\().8b,  \r2\().8h
+        bl              get_gaussian_neon
+        srshl           \r2\().8h,  v0.8h,  v31.8h
+        xtn2            \r1\().16b, \r2\().8h
+        bl              get_gaussian_neon
+        srshl           \r2\().8h,  v0.8h,  v31.8h
+        xtn             \r2\().8b,  \r2\().8h
+
+        increment_seed  4
+        read_rand       x14, 11,  3
+        read_rand       x15, 11,  2
+        add             x14, x3,  x14, lsl #1
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[0], [x14]
+        read_rand       x14, 11,  1
+        ld1             {v0.h}[1], [x15]
+        read_rand       x15, 11,  0
+        add             x14, x3,  x14, lsl #1
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[2], [x14]
+        ld1             {v0.h}[3], [x15]
+        srshl           v0.4h,      v0.4h,  v31.4h
+        xtn2            \r2\().16b, v0.8h
+.endm
+
+.macro store_grain_row_44 r0, r1, r2
+        st1             {\r0\().16b,\r1\().16b}, [x0], #32
+        st1             {\r2\().16b},  [x0]
+        add             x0,  x0,  #GRAIN_WIDTH-32
+.endm
+
+function get_grain_2_neon
+        increment_seed  2
+        read_rand       x14, 11,  1
+        read_rand       x15, 11,  0
+        add             x14, x3,  x14, lsl #1
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[0], [x14]
+        ld1             {v0.h}[1], [x15]
+        srshl           v0.4h,   v0.4h,   v31.4h
+        xtn             v0.8b,   v0.8h
+        ret
+endfunc
+
+.macro get_grain_2 dst
+        bl              get_grain_2_neon
+.ifnc \dst, v0
+        mov             \dst\().8b, v0.8b
+.endif
+.endm
+
+// w15 holds the number of entries to produce
+// w14, w16 and w17 hold the previous output entries
+// v0 holds the vector of produced entries
+// v1 holds the input vector of sums from above
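+// Each iteration produces one value,
+//     round2(sum_above + coeffs * previous outputs, ar_coeff_shift)
+//       + round2(dav1d_gaussian_sequence[rand(11 bits)], 4 + grain_scale_shift),
+// clamps it to the grain min/max and inserts it into the top lane of v0.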
+.macro output_lag n
+function output_lag\n\()_neon
+1:
+        read_shift_rand x13, 11
+        mov             w11, v1.s[0]
+        ldrsh           w12, [x3, x13, lsl #1]
+        ext             v0.16b,  v0.16b,  v0.16b,  #1
+.if \n == 1
+        madd            w11, w14, w4,  w11        // sum (above) + *coeff * prev output
+.elseif \n == 2
+        madd            w11, w16, w4,  w11        // sum (above) + *coeff * prev output 1
+        madd            w11, w14, w17, w11        // += *coeff * prev output 2
+        mov             w16, w14
+.else
+        madd            w11, w17, w4,  w11        // sum (above) + *coeff * prev output 1
+        madd            w11, w16, w20, w11        // sum (above) + *coeff * prev output 2
+        madd            w11, w14, w21, w11        // += *coeff * prev output 3
+        mov             w17, w16
+        mov             w16, w14
+.endif
+        add             w14, w11, w8              // 1 << (ar_coeff_shift - 1)
+        add             w12, w12, w10             // 1 << (4 + grain_scale_shift - 1)
+        asr             w14, w14, w7              // >> ar_coeff_shift
+        asr             w12, w12, w9              // >> (4 + grain_scale_shift)
+        add             w14, w14, w12
+        cmp             w14, w5
+        csel            w14, w14, w5,  le
+        cmp             w14, w6
+        csel            w14, w14, w6,  ge
+        subs            w15, w15, #1
+        ext             v1.16b,  v1.16b,  v1.16b,  #4
+        ins             v0.b[15], w14
+        b.gt            1b
+        ret
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+
+
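+// Multiplies the row above by the three lag-1 coefficients: v0 holds the
+// above-left neighbours, v3 the above pixels, v1 the above-right neighbours,
+// with the coefficients broadcast in v27/v28/v29; the 32 bit sums for 16
+// output pixels are returned in v4-v7.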
+function sum_lag1_above_neon
+        smull           v2.8h,   v3.8b,   v28.8b
+        smull2          v3.8h,   v3.16b,  v28.16b
+        smull           v4.8h,   v0.8b,   v27.8b
+        smull2          v5.8h,   v0.16b,  v27.16b
+        smull           v6.8h,   v1.8b,   v29.8b
+        smull2          v7.8h,   v1.16b,  v29.16b
+        saddl           v0.4s,   v2.4h,   v4.4h
+        saddl2          v1.4s,   v2.8h,   v4.8h
+        saddl           v2.4s,   v3.4h,   v5.4h
+        saddl2          v3.4s,   v3.8h,   v5.8h
+        saddw           v4.4s,   v0.4s,   v6.4h
+        saddw2          v5.4s,   v1.4s,   v6.8h
+        saddw           v6.4s,   v2.4s,   v7.4h
+        saddw2          v7.4s,   v3.4s,   v7.8h
+        ret
+endfunc
+
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
+        bl              sum_\lag\()_above_neon
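+        // For the chroma variants, derive the collocated luma pixels first:
+        // average 2x2 blocks (420) or horizontal pairs (422), or take luma
+        // as-is (444), then accumulate the luma coefficient (v30 or
+        // \uv_coeff) times luma into the 32 bit sums.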
+.ifc \type, uv_420
+        add             x12, x19, #GRAIN_WIDTH
+        ld1             {v22.16b, v23.16b}, [x19], #32
+        ld1             {v24.16b, v25.16b}, [x12]
+        saddlp          v22.8h,  v22.16b
+        saddlp          v23.8h,  v23.16b
+        saddlp          v24.8h,  v24.16b
+        saddlp          v25.8h,  v25.16b
+        add             v22.8h,  v22.8h,  v24.8h
+        add             v23.8h,  v23.8h,  v25.8h
+        rshrn           v0.8b,   v22.8h,  #2
+        rshrn2          v0.16b,  v23.8h,  #2
+.endif
+.ifc \type, uv_422
+        ld1             {v22.16b, v23.16b}, [x19], #32
+        saddlp          v22.8h,  v22.16b
+        saddlp          v23.8h,  v23.16b
+        rshrn           v0.8b,   v22.8h,  #1
+        rshrn2          v0.16b,  v23.8h,  #1
+.endif
+.ifc \type, uv_444
+        ld1             {v0.16b}, [x19], #16
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+        dup             v1.16b,  \uv_coeff
+        smull           v2.8h,   v0.8b,   v1.8b
+        smull2          v3.8h,   v0.16b,  v1.16b
+.else
+        smull           v2.8h,   v0.8b,   v30.8b
+        smull2          v3.8h,   v0.16b,  v30.16b
+.endif
+        saddw           v4.4s,   v4.4s,   v2.4h
+        saddw2          v5.4s,   v5.4s,   v2.8h
+        saddw           v6.4s,   v6.4s,   v3.4h
+        saddw2          v7.4s,   v7.4s,   v3.8h
+.endif
+.if \uv_layout && \elems == 16
+        b               sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 15
+        b               sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 9
+        b               sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+.ifc \edge, left
+        increment_seed  4
+        read_rand       x12, 11,  3
+        read_rand       x13, 11,  2
+        read_rand       x14, 11,  1
+        add             x12, x3,  x12, lsl #1
+        add             x13, x3,  x13, lsl #1
+        add             x14, x3,  x14, lsl #1
+        ld1             {v0.h}[5], [x12]
+        ld1             {v0.h}[6], [x13]
+        ld1             {v0.h}[7], [x14]
+        lsl             x2,  x2,  #1             // shift back the state as if we'd done increment_seed with shift=0
+        srshl           v0.8h,   v0.8h,   v31.8h
+        xtn2            v0.16b,  v0.8h
+        ext             v4.16b,  v4.16b,  v4.16b,  #12
+.ifc \lag, lag3
+        smov            w17, v0.b[13]
+.endif
+.ifnc \lag, lag1
+        smov            w16, v0.b[14]
+.endif
+        smov            w14, v0.b[15]
+
+        mov             v1.16b,  v4.16b
+        mov             w15, #1
+        bl              output_\lag\()_neon
+.else
+        increment_seed  4, shift=0
+        mov             v1.16b,  v4.16b
+        mov             w15, #4
+        bl              output_\lag\()_neon
+.endif
+
+        increment_seed  4, shift=0
+        mov             v1.16b,  v5.16b
+        mov             w15, #4
+        bl              output_\lag\()_neon
+
+        increment_seed  4, shift=0
+        mov             v1.16b,  v6.16b
+.if \elems == 9
+        mov             w15, #1
+        bl              output_\lag\()_neon
+        lsr             w2,  w2,  #3
+
+        read_rand       x12, 11,  2
+        read_rand       x13, 11,  1
+        read_rand       x14, 11,  0
+        add             x12, x3,  x12, lsl #1
+        add             x13, x3,  x13, lsl #1
+        add             x14, x3,  x14, lsl #1
+        ld1             {v1.h}[0], [x12]
+        ld1             {v1.h}[1], [x13]
+        ld1             {v1.h}[2], [x14]
+        srshl           v1.4h,   v1.4h,   v31.4h
+        xtn             v1.8b,   v1.8h
+        ext             v0.16b,  v0.16b,  v1.16b,  #7
+.else
+        mov             w15, #4
+        bl              output_\lag\()_neon
+
+        increment_seed  4, shift=0
+        mov             v1.16b,  v7.16b
+
+.ifc \edge, right
+        mov             w15, #3
+        bl              output_\lag\()_neon
+        read_shift_rand x15, 11
+        add             x15, x3,  x15, lsl #1
+        ld1             {v1.h}[0], [x15]
+        srshl           v1.4h,   v1.4h,   v31.4h
+        ext             v0.16b,  v0.16b,  v1.16b,  #1
+.else
+        mov             w15, #4
+        bl              output_\lag\()_neon
+.endif
+.endif
+.if \store
+        st1             {v0.16b}, [x0], #16
+.endif
+        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+.endif
+.endm
+
+.macro sum_lag1_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag1_\edge\()_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems, store=0
+endfunc
+.endm
+
+sum_lag1_func y,      0,   left
+sum_lag1_func y,      0,   mid
+sum_lag1_func y,      0,   right, 15
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 15
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 9
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 9
+
+.macro sum_lag1 type, dst, left, mid, right, edge=mid
+        mov             v3.16b,  \mid\().16b
+        ext             v0.16b,  \left\().16b, \mid\().16b,   #15
+        ext             v1.16b,  \mid\().16b,  \right\().16b, #1
+        bl              sum_\type\()_lag1_\edge\()_neon
+        mov             \dst\().16b, v0.16b
+.endm
+
+.macro sum_y_lag1 dst, left, mid, right, edge=mid
+        sum_lag1        y, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid
+        sum_lag1        uv_444, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid
+        sum_lag1        uv_422, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid
+        sum_lag1        uv_420, \dst, \left, \mid, \right, \edge
+.endm
+
+
+function sum_lag2_above_neon
+        sub             x12, x0,  #2*GRAIN_WIDTH - 16
+        sub             x13, x0,  #1*GRAIN_WIDTH - 16
+        ld1             {v18.16b}, [x12] // load top right
+        ld1             {v21.16b}, [x13]
+
+        ext             v22.16b, v16.16b, v17.16b, #14 // top left, top mid
+        dup             v26.16b, v30.b[0]
+        ext             v23.16b, v16.16b, v17.16b, #15
+        dup             v27.16b, v30.b[1]
+        ext             v0.16b,  v17.16b, v18.16b, #1  // top mid, top right
+        dup             v28.16b, v30.b[3]
+        ext             v1.16b,  v17.16b, v18.16b, #2
+        dup             v29.16b, v30.b[4]
+
+        smull           v2.8h,   v22.8b,  v26.8b
+        smull2          v3.8h,   v22.16b, v26.16b
+        smull           v4.8h,   v23.8b,  v27.8b
+        smull2          v5.8h,   v23.16b, v27.16b
+        smull           v6.8h,   v0.8b,   v28.8b
+        smull2          v7.8h,   v0.16b,  v28.16b
+        smull           v0.8h,   v1.8b,   v29.8b
+        smull2          v1.8h,   v1.16b,  v29.16b
+        saddl           v22.4s,  v2.4h,   v4.4h
+        saddl2          v23.4s,  v2.8h,   v4.8h
+        saddl           v26.4s,  v3.4h,   v5.4h
+        saddl2          v27.4s,  v3.8h,   v5.8h
+        saddl           v2.4s,   v0.4h,   v6.4h
+        saddl2          v3.4s,   v0.8h,   v6.8h
+        saddl           v6.4s,   v1.4h,   v7.4h
+        saddl2          v7.4s,   v1.8h,   v7.8h
+        add             v4.4s,   v22.4s,  v2.4s
+        add             v5.4s,   v23.4s,  v3.4s
+        add             v6.4s,   v26.4s,  v6.4s
+        add             v7.4s,   v27.4s,  v7.4s
+
+        ext             v22.16b, v19.16b, v20.16b, #14 // top left, top mid
+        dup             v26.16b, v30.b[5]
+        ext             v23.16b, v19.16b, v20.16b, #15
+        dup             v27.16b, v30.b[6]
+        ext             v0.16b,  v20.16b, v21.16b, #1  // top mid, top right
+        dup             v28.16b, v30.b[8]
+        ext             v1.16b,  v20.16b, v21.16b, #2
+        dup             v29.16b, v30.b[9]
+
+        smull           v2.8h,   v22.8b,  v26.8b
+        smull2          v3.8h,   v22.16b, v26.16b
+        smull           v22.8h,  v23.8b,  v27.8b
+        smull2          v23.8h,  v23.16b, v27.16b
+        smull           v26.8h,  v0.8b,   v28.8b
+        smull2          v27.8h,  v0.16b,  v28.16b
+        smull           v28.8h,  v1.8b,   v29.8b
+        smull2          v29.8h,  v1.16b,  v29.16b
+        saddl           v0.4s,   v2.4h,   v22.4h
+        saddl2          v1.4s,   v2.8h,   v22.8h
+        saddl           v2.4s,   v3.4h,   v23.4h
+        saddl2          v3.4s,   v3.8h,   v23.8h
+        saddl           v22.4s,  v26.4h,  v28.4h
+        saddl2          v23.4s,  v26.8h,  v28.8h
+        saddl           v26.4s,  v27.4h,  v29.4h
+        saddl2          v27.4s,  v27.8h,  v29.8h
+        add             v0.4s,   v0.4s,   v22.4s
+        add             v1.4s,   v1.4s,   v23.4s
+        add             v2.4s,   v2.4s,   v26.4s
+        add             v3.4s,   v3.4s,   v27.4s
+        dup             v26.16b, v30.b[2]
+        dup             v27.16b, v30.b[7]
+        smull           v22.8h,  v17.8b,  v26.8b
+        smull2          v23.8h,  v17.16b, v26.16b
+        smull           v24.8h,  v20.8b,  v27.8b
+        smull2          v25.8h,  v20.16b, v27.16b
+        add             v4.4s,   v4.4s,   v0.4s
+        add             v5.4s,   v5.4s,   v1.4s
+        add             v6.4s,   v6.4s,   v2.4s
+        add             v7.4s,   v7.4s,   v3.4s
+
+        mov             v16.16b, v17.16b
+        mov             v17.16b, v18.16b
+
+        saddl           v0.4s,   v22.4h,  v24.4h
+        saddl2          v1.4s,   v22.8h,  v24.8h
+        saddl           v2.4s,   v23.4h,  v25.4h
+        saddl2          v3.4s,   v23.8h,  v25.8h
+        mov             v19.16b, v20.16b
+        mov             v20.16b, v21.16b
+        add             v4.4s,   v4.4s,   v0.4s
+        add             v5.4s,   v5.4s,   v1.4s
+        add             v6.4s,   v6.4s,   v2.4s
+        add             v7.4s,   v7.4s,   v3.4s
+        ret
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag2_\edge\()_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+.ifc \edge, left
+        sub             x12, x0,  #2*GRAIN_WIDTH
+        sub             x13, x0,  #1*GRAIN_WIDTH
+        ld1             {v17.16b}, [x12] // load the previous block right above
+        ld1             {v20.16b}, [x13]
+.endif
+        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[12]
+endfunc
+.endm
+
+sum_lag2_func y,      0,   left
+sum_lag2_func y,      0,   mid
+sum_lag2_func y,      0,   right, 15
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 15
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 9
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 9
+
+
+function sum_lag3_above_neon
+        sub             x11, x0,  #3*GRAIN_WIDTH - 16
+        sub             x12, x0,  #2*GRAIN_WIDTH - 16
+        sub             x13, x0,  #1*GRAIN_WIDTH - 16
+        ld1             {v15.16b}, [x11] // load top right
+        ld1             {v18.16b}, [x12]
+        ld1             {v21.16b}, [x13]
+
+        ext             v8.16b,  v13.16b, v14.16b, #13 // top left, top mid
+        dup             v22.16b, v29.b[0]
+        ext             v9.16b,  v13.16b, v14.16b, #14
+        dup             v23.16b, v29.b[1]
+        ext             v10.16b, v13.16b, v14.16b, #15
+        dup             v24.16b, v29.b[2]
+        dup             v25.16b, v29.b[3]
+        ext             v11.16b, v14.16b, v15.16b, #1  // top mid, top right
+        dup             v26.16b, v29.b[4]
+        ext             v12.16b, v14.16b, v15.16b, #2
+        dup             v27.16b, v29.b[5]
+        ext             v13.16b, v14.16b, v15.16b, #3
+        dup             v28.16b, v29.b[6]
+
+        smull           v0.8h,   v8.8b,   v22.8b
+        smull2          v1.8h,   v8.16b,  v22.16b
+        smull           v2.8h,   v9.8b,   v23.8b
+        smull2          v3.8h,   v9.16b,  v23.16b
+        smull           v8.8h,   v10.8b,  v24.8b
+        smull2          v9.8h,   v10.16b, v24.16b
+        smull           v10.8h,  v11.8b,  v26.8b
+        smull2          v11.8h,  v11.16b, v26.16b
+        saddl           v22.4s,  v0.4h,   v2.4h
+        saddl2          v23.4s,  v0.8h,   v2.8h
+        saddl           v24.4s,  v1.4h,   v3.4h
+        saddl2          v26.4s,  v1.8h,   v3.8h
+        saddl           v0.4s,   v8.4h,   v10.4h
+        saddl2          v1.4s,   v8.8h,   v10.8h
+        saddl           v2.4s,   v9.4h,   v11.4h
+        saddl2          v3.4s,   v9.8h,   v11.8h
+        smull           v8.8h,   v12.8b,  v27.8b
+        smull2          v9.8h,   v12.16b, v27.16b
+        smull           v10.8h,  v13.8b,  v28.8b
+        smull2          v11.8h,  v13.16b, v28.16b
+        smull           v12.8h,  v14.8b,  v25.8b
+        smull2          v13.8h,  v14.16b, v25.16b
+        add             v4.4s,   v22.4s,  v0.4s
+        add             v5.4s,   v23.4s,  v1.4s
+        add             v6.4s,   v24.4s,  v2.4s
+        add             v7.4s,   v26.4s,  v3.4s
+        saddl           v0.4s,   v8.4h,   v10.4h
+        saddl2          v1.4s,   v8.8h,   v10.8h
+        saddl           v2.4s,   v9.4h,   v11.4h
+        saddl2          v3.4s,   v9.8h,   v11.8h
+        add             v4.4s,   v4.4s,   v0.4s
+        add             v5.4s,   v5.4s,   v1.4s
+        add             v6.4s,   v6.4s,   v2.4s
+        add             v7.4s,   v7.4s,   v3.4s
+        saddw           v4.4s,   v4.4s,   v12.4h
+        saddw2          v5.4s,   v5.4s,   v12.8h
+        saddw           v6.4s,   v6.4s,   v13.4h
+        saddw2          v7.4s,   v7.4s,   v13.8h
+
+        ext             v8.16b,  v16.16b, v17.16b, #13 // top left, top mid
+        dup             v22.16b, v29.b[7]
+        ext             v9.16b,  v16.16b, v17.16b, #14
+        dup             v23.16b, v29.b[8]
+        ext             v10.16b, v16.16b, v17.16b, #15
+        dup             v24.16b, v29.b[9]
+        dup             v25.16b, v29.b[10]
+        ext             v11.16b, v17.16b, v18.16b, #1  // top mid, top right
+        dup             v26.16b, v29.b[11]
+        ext             v12.16b, v17.16b, v18.16b, #2
+        dup             v27.16b, v29.b[12]
+        ext             v13.16b, v17.16b, v18.16b, #3
+        dup             v28.16b, v29.b[13]
+
+        smull           v0.8h,   v8.8b,   v22.8b
+        smull2          v1.8h,   v8.16b,  v22.16b
+        smull           v2.8h,   v9.8b,   v23.8b
+        smull2          v3.8h,   v9.16b,  v23.16b
+        smull           v8.8h,   v10.8b,  v24.8b
+        smull2          v9.8h,   v10.16b, v24.16b
+        smull           v10.8h,  v11.8b,  v26.8b
+        smull2          v11.8h,  v11.16b, v26.16b
+        saddl           v22.4s,  v0.4h,   v2.4h
+        saddl2          v23.4s,  v0.8h,   v2.8h
+        saddl           v24.4s,  v1.4h,   v3.4h
+        saddl2          v26.4s,  v1.8h,   v3.8h
+        saddl           v0.4s,   v8.4h,   v10.4h
+        saddl2          v1.4s,   v8.8h,   v10.8h
+        saddl           v2.4s,   v9.4h,   v11.4h
+        saddl2          v3.4s,   v9.8h,   v11.8h
+        smull           v8.8h,   v12.8b,  v27.8b
+        smull2          v9.8h,   v12.16b, v27.16b
+        smull           v10.8h,  v13.8b,  v28.8b
+        smull2          v11.8h,  v13.16b, v28.16b
+        smull           v12.8h,  v17.8b,  v25.8b
+        smull2          v13.8h,  v17.16b, v25.16b
+        add             v22.4s,  v22.4s,  v0.4s
+        add             v23.4s,  v23.4s,  v1.4s
+        add             v24.4s,  v24.4s,  v2.4s
+        add             v26.4s,  v26.4s,  v3.4s
+        saddl           v0.4s,   v8.4h,   v10.4h
+        saddl2          v1.4s,   v8.8h,   v10.8h
+        saddl           v2.4s,   v9.4h,   v11.4h
+        saddl2          v3.4s,   v9.8h,   v11.8h
+        add             v4.4s,   v4.4s,   v22.4s
+        add             v5.4s,   v5.4s,   v23.4s
+        add             v6.4s,   v6.4s,   v24.4s
+        add             v7.4s,   v7.4s,   v26.4s
+        add             v4.4s,   v4.4s,   v0.4s
+        add             v5.4s,   v5.4s,   v1.4s
+        add             v6.4s,   v6.4s,   v2.4s
+        add             v7.4s,   v7.4s,   v3.4s
+        saddw           v4.4s,   v4.4s,   v12.4h
+        saddw2          v5.4s,   v5.4s,   v12.8h
+        saddw           v6.4s,   v6.4s,   v13.4h
+        saddw2          v7.4s,   v7.4s,   v13.8h
+
+        ext             v8.16b,  v19.16b, v20.16b, #13 // top left, top mid
+        dup             v22.16b, v29.b[14]
+        ext             v9.16b,  v19.16b, v20.16b, #14
+        dup             v23.16b, v29.b[15]
+        ext             v10.16b, v19.16b, v20.16b, #15
+        dup             v24.16b, v30.b[0]
+        dup             v25.16b, v30.b[1]
+        ext             v11.16b, v20.16b, v21.16b, #1  // top mid, top right
+        dup             v26.16b, v30.b[2]
+        ext             v12.16b, v20.16b, v21.16b, #2
+        dup             v27.16b, v30.b[3]
+        ext             v13.16b, v20.16b, v21.16b, #3
+        dup             v28.16b, v30.b[4]
+
+        smull           v0.8h,   v8.8b,   v22.8b
+        smull2          v1.8h,   v8.16b,  v22.16b
+        smull           v2.8h,   v9.8b,   v23.8b
+        smull2          v3.8h,   v9.16b,  v23.16b
+        smull           v8.8h,   v10.8b,  v24.8b
+        smull2          v9.8h,   v10.16b, v24.16b
+        smull           v10.8h,  v11.8b,  v26.8b
+        smull2          v11.8h,  v11.16b, v26.16b
+        saddl           v22.4s,  v0.4h,   v2.4h
+        saddl2          v23.4s,  v0.8h,   v2.8h
+        saddl           v24.4s,  v1.4h,   v3.4h
+        saddl2          v26.4s,  v1.8h,   v3.8h
+        saddl           v0.4s,   v8.4h,   v10.4h
+        saddl2          v1.4s,   v8.8h,   v10.8h
+        saddl           v2.4s,   v9.4h,   v11.4h
+        saddl2          v3.4s,   v9.8h,   v11.8h
+        smull           v8.8h,   v12.8b,  v27.8b
+        smull2          v9.8h,   v12.16b, v27.16b
+        smull           v10.8h,  v13.8b,  v28.8b
+        smull2          v11.8h,  v13.16b, v28.16b
+        smull           v12.8h,  v20.8b,  v25.8b
+        smull2          v19.8h,  v20.16b, v25.16b
+        add             v22.4s,  v22.4s,  v0.4s
+        add             v23.4s,  v23.4s,  v1.4s
+        add             v24.4s,  v24.4s,  v2.4s
+        add             v26.4s,  v26.4s,  v3.4s
+        saddl           v0.4s,   v8.4h,   v10.4h
+        saddl2          v1.4s,   v8.8h,   v10.8h
+        saddl           v2.4s,   v9.4h,   v11.4h
+        saddl2          v3.4s,   v9.8h,   v11.8h
+        add             v4.4s,   v4.4s,   v22.4s
+        add             v5.4s,   v5.4s,   v23.4s
+        add             v6.4s,   v6.4s,   v24.4s
+        add             v7.4s,   v7.4s,   v26.4s
+        mov             v13.16b, v14.16b
+        mov             v14.16b, v15.16b
+        add             v4.4s,   v4.4s,   v0.4s
+        add             v5.4s,   v5.4s,   v1.4s
+        add             v6.4s,   v6.4s,   v2.4s
+        add             v7.4s,   v7.4s,   v3.4s
+        mov             v16.16b, v17.16b
+        mov             v17.16b, v18.16b
+        saddw           v4.4s,   v4.4s,   v12.4h
+        saddw2          v5.4s,   v5.4s,   v12.8h
+        saddw           v6.4s,   v6.4s,   v19.4h
+        saddw2          v7.4s,   v7.4s,   v19.8h
+
+        mov             v19.16b, v20.16b
+        mov             v20.16b, v21.16b
+        ret
+endfunc
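If I read the register shuffling above correctly, sum_lag3_above_neon accumulates, for 16 columns at a time, the 21 "rows above" terms of the lag-3 autoregression; the three same-row terms plus the rounding and clamping are added afterwards by output_lag3 through sum_lag_n_body. A hedged C sketch of that accumulation for the luma case (all identifiers are illustrative, not dav1d names):

    #include <stdint.h>

    /* 16 output columns per call, three rows above the current one */
    static void sum_lag3_above_sketch(int32_t sum_above[16],
                                      const int8_t grain[][82 /* GRAIN_WIDTH */],
                                      const int8_t ar_coeffs_y[24], int y, int x0)
    {
        for (int i = 0; i < 16; i++) {
            int32_t sum = 0, k = 0;
            for (int dy = -3; dy <= -1; dy++)
                for (int dx = -3; dx <= 3; dx++)
                    sum += ar_coeffs_y[k++] * grain[y + dy][x0 + i + dx];
            sum_above[i] = sum;   /* same-row terms are handled later, per column */
        }
    }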
+
+.macro sum_lag3_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag3_\edge\()_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+.ifc \edge, left
+        sub             x11, x0,  #3*GRAIN_WIDTH
+        sub             x12, x0,  #2*GRAIN_WIDTH
+        sub             x13, x0,  #1*GRAIN_WIDTH
+        ld1             {v14.16b}, [x11] // load the previous block right above
+        ld1             {v17.16b}, [x12]
+        ld1             {v20.16b}, [x13]
+.endif
+        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[8]
+endfunc
+.endm
+
+sum_lag3_func y,      0,   left
+sum_lag3_func y,      0,   mid
+sum_lag3_func y,      0,   right, 15
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 15
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 9
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 9
+
+function generate_grain_rows_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+1:
+        get_grain_row   v16, v17, v18, v19, v20, v21
+        subs            w1,  w1,  #1
+        store_grain_row v16, v17, v18, v19, v20, v21
+        b.gt            1b
+        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+endfunc
+
+function generate_grain_rows_44_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+1:
+        get_grain_row_44 v16, v17, v18
+        subs            w1,  w1,  #1
+        store_grain_row_44 v16, v17, v18
+        b.gt            1b
+        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+endfunc
+
+function get_grain_row_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+        get_grain_row   v16, v17, v18, v19, v20, v21
+        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+endfunc
+
+function get_grain_row_44_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+        get_grain_row_44 v16, v17, v18
+        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+endfunc
+
+function add_uv_444_coeff_lag0_neon
+add_coeff_lag0_start:
+        smull           v2.8h,   v0.8b,   v27.8b
+        smull2          v3.8h,   v0.16b,  v27.16b
+        srshl           v2.8h,   v2.8h,   v28.8h
+        srshl           v3.8h,   v3.8h,   v28.8h
+        saddw           v2.8h,   v2.8h,   v1.8b
+        saddw2          v3.8h,   v3.8h,   v1.16b
+        sqxtn           v2.8b,   v2.8h
+        sqxtn2          v2.16b,  v3.8h
+        ret
+endfunc
+
+function add_uv_420_coeff_lag0_neon
+        ld1             {v4.16b, v5.16b}, [x19], #32
+        ld1             {v6.16b, v7.16b}, [x12], #32
+        saddlp          v4.8h,   v4.16b
+        saddlp          v5.8h,   v5.16b
+        saddlp          v6.8h,   v6.16b
+        saddlp          v7.8h,   v7.16b
+        add             v4.8h,   v4.8h,   v6.8h
+        add             v5.8h,   v5.8h,   v7.8h
+        rshrn           v4.8b,   v4.8h,   #2
+        rshrn2          v4.16b,  v5.8h,   #2
+        and             v0.16b,  v4.16b,  v0.16b
+        b               add_coeff_lag0_start
+endfunc
+
+function add_uv_422_coeff_lag0_neon
+        ld1             {v4.16b, v5.16b}, [x19], #32
+        saddlp          v4.8h,   v4.16b
+        saddlp          v5.8h,   v5.16b
+        rshrn           v4.8b,   v4.8h,   #1
+        rshrn2          v4.16b,  v5.8h,   #1
+        and             v0.16b,  v4.16b,  v0.16b
+        b               add_coeff_lag0_start
+endfunc
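A hedged sketch of what the add_*_coeff_lag0 helpers above compute per chroma grain sample; the edge masking done through v29/v30 is omitted and all names are illustrative:

    #include <stdint.h>

    static int8_t clip8(int v) { return v < -128 ? -128 : v > 127 ? 127 : (int8_t)v; }

    /* luma_avg: the co-located luma grain, averaged as (a+b+c+d+2)>>2 for 4:2:0
     * (saddlp + rshrn #2), (a+b+1)>>1 for 4:2:2, or taken as-is for 4:4:4. */
    static int8_t add_coeff_lag0_sketch(int8_t cur, int luma_avg,
                                        int ar_coeff_uv0, int ar_coeff_shift)
    {
        const int round = 1 << (ar_coeff_shift - 1);
        const int delta = (luma_avg * ar_coeff_uv0 + round) >> ar_coeff_shift; /* smull + srshl */
        return clip8(cur + delta);                                             /* saddw + sqxtn */
    }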
+
+.macro gen_grain_82 type
+function generate_grain_\type\()_8bpc_neon, export=1
+        AARCH64_SIGN_LINK_REGISTER
+        stp             x30, x19, [sp, #-96]!
+
+.ifc \type, uv_444
+        mov             w13, w3
+        mov             w14, #28
+        add             x19, x1,  #3*GRAIN_WIDTH
+        mov             x1,  x2
+        mul             w13, w13, w14
+.endif
+        movrel          x3,  X(gaussian_sequence)
+        ldr             w2,  [x1, #FGD_SEED]
+        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+        add             x4,  x1,  #FGD_AR_COEFFS_Y
+.else
+        add             x4,  x1,  #FGD_AR_COEFFS_UV
+.endif
+        adr             x16, L(gen_grain_\type\()_tbl)
+        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
+        add             w9,  w9,  #4
+        ldrh            w17, [x16, w17, uxtw #1]
+        dup             v31.8h,  w9    // 4 + data->grain_scale_shift
+        sub             x16, x16, w17, uxtw
+        neg             v31.8h,  v31.8h
+
+.ifc \type, uv_444
+        cmp             w13, #0
+        mov             w11, #0x49d8
+        mov             w14, #0xb524
+        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
+        csel            w11, w11, w14, ne
+.endif
+
+        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
+        mov             w8,  #1
+        mov             w10, #1
+        lsl             w8,  w8,  w7        // 1 << ar_coeff_shift
+        lsl             w10, w10, w9        // 1 << (4 + data->grain_scale_shift)
+        lsr             w8,  w8,  #1        // 1 << (ar_coeff_shift - 1)
+        lsr             w10, w10, #1        // 1 << (4 + data->grain_scale_shift - 1)
+        mov             w5,  #127
+        mov             w6,  #-128
+
+.ifc \type, uv_444
+        eor             w2,  w2,  w11
+.endif
+
+        br              x16
+
+L(generate_grain_\type\()_lag0):
+        AARCH64_VALID_JUMP_TARGET
+.ifc \type, y
+        mov             w1,  #GRAIN_HEIGHT
+        bl              generate_grain_rows_neon
+.else
+        dup             v28.8h,  w7
+        ld1r            {v27.16b}, [x4]     // ar_coeffs_uv[0]
+        movi            v0.16b,  #0
+        movi            v1.16b,  #255
+        ext             v29.16b, v0.16b,  v1.16b,  #13
+        ext             v30.16b, v1.16b,  v0.16b,  #1
+        neg             v28.8h,  v28.8h
+
+        mov             w1,  #3
+        bl              generate_grain_rows_neon
+        mov             w1,  #GRAIN_HEIGHT-3
+1:
+        ld1             {v22.16b, v23.16b, v24.16b, v25.16b}, [x19], #64
+        bl              get_grain_row_neon
+        and             v0.16b,  v22.16b, v29.16b
+        mov             v1.16b,  v16.16b
+        bl              add_uv_444_coeff_lag0_neon
+        mov             v0.16b,  v23.16b
+        mov             v1.16b,  v17.16b
+        mov             v16.16b, v2.16b
+        bl              add_uv_444_coeff_lag0_neon
+        ld1             {v26.16b}, [x19], #16
+        mov             v0.16b,  v24.16b
+        mov             v1.16b,  v18.16b
+        mov             v17.16b, v2.16b
+        bl              add_uv_444_coeff_lag0_neon
+        add             x19, x19, #2
+        mov             v0.16b,  v25.16b
+        mov             v1.16b,  v19.16b
+        mov             v18.16b, v2.16b
+        bl              add_uv_444_coeff_lag0_neon
+        and             v0.16b,  v26.16b, v30.16b
+        mov             v1.16b,  v20.16b
+        mov             v19.16b, v2.16b
+        bl              add_uv_444_coeff_lag0_neon
+        mov             v20.16b, v2.16b
+        subs            w1,  w1,  #1
+        store_grain_row v16, v17, v18, v19, v20, v21
+        b.gt            1b
+.endif
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag1):
+        AARCH64_VALID_JUMP_TARGET
+        ld1r            {v27.16b}, [x4], #1 // ar_coeffs_y[0]
+        ld1r            {v28.16b}, [x4], #1 // ar_coeffs_y[1]
+        ld1r            {v29.16b}, [x4]     // ar_coeffs_y[2]
+.ifc \type, y
+        ldrsb           w4,  [x4, #1]       // ar_coeffs_y[3]
+.else
+        add             x4,  x4,  #2
+.endif
+
+        mov             w1,  #3
+.ifc \type, uv_444
+        ld1r            {v30.16b}, [x4]     // ar_coeffs_uv[4]
+        ldursb          w4,  [x4, #-1]      // ar_coeffs_uv[3]
+.endif
+        bl              generate_grain_rows_neon
+
+        mov             w1,  #GRAIN_HEIGHT - 3
+1:
+        sum_\type\()_lag1 v22, v16, v16, v17, left
+        sum_\type\()_lag1 v23, v16, v17, v18
+        sum_\type\()_lag1 v24, v17, v18, v19
+        sum_\type\()_lag1 v25, v18, v19, v20
+        sum_\type\()_lag1 v20, v19, v20, v21, right
+        get_grain_2     v21
+        subs            w1,  w1,  #1
+.ifc \type, uv_444
+        add             x19, x19, #2
+.endif
+        store_grain_row v22, v23, v24, v25, v20, v21
+        mov             v16.16b, v22.16b
+        mov             v17.16b, v23.16b
+        mov             v18.16b, v24.16b
+        mov             v19.16b, v25.16b
+        b.gt            1b
+
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag2):
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v30.16b}, [x4]     // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+        smov            w4,  v30.b[10]
+        smov            w17, v30.b[11]
+
+        mov             w1,  #3
+        bl              generate_grain_rows_neon
+
+        mov             w1,  #GRAIN_HEIGHT - 3
+1:
+        bl              sum_\type\()_lag2_left_neon
+        bl              sum_\type\()_lag2_mid_neon
+        bl              sum_\type\()_lag2_mid_neon
+        bl              sum_\type\()_lag2_mid_neon
+        bl              sum_\type\()_lag2_right_neon
+        get_grain_2     v16
+        subs            w1,  w1,  #1
+.ifc \type, uv_444
+        add             x19, x19, #2
+.endif
+        st1             {v16.h}[0], [x0], #2
+        b.gt            1b
+
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag3):
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+        stp             d8,  d9,  [sp, #16]
+        stp             d10, d11, [sp, #32]
+        stp             d12, d13, [sp, #48]
+        stp             d14, d15, [sp, #64]
+        stp             x20, x21, [sp, #80]
+
+        smov            w4,  v30.b[5]
+        smov            w20, v30.b[6]
+        smov            w21, v30.b[7]
+
+        mov             w1,  #3
+        bl              generate_grain_rows_neon
+
+        mov             w1,  #GRAIN_HEIGHT - 3
+1:
+        bl              sum_\type\()_lag3_left_neon
+        bl              sum_\type\()_lag3_mid_neon
+        bl              sum_\type\()_lag3_mid_neon
+        bl              sum_\type\()_lag3_mid_neon
+        bl              sum_\type\()_lag3_right_neon
+        get_grain_2     v16
+        subs            w1,  w1,  #1
+.ifc \type, uv_444
+        add             x19, x19, #2
+.endif
+        st1             {v16.h}[0], [x0], #2
+        b.gt            1b
+
+        ldp             x20, x21, [sp, #80]
+        ldp             d14, d15, [sp, #64]
+        ldp             d12, d13, [sp, #48]
+        ldp             d10, d11, [sp, #32]
+        ldp             d8,  d9,  [sp, #16]
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(gen_grain_\type\()_tbl):
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+.macro set_height dst, type
+.ifc \type, uv_420
+        mov             \dst,  #SUB_GRAIN_HEIGHT-3
+.else
+        mov             \dst,  #GRAIN_HEIGHT-3
+.endif
+.endm
+
+.macro increment_y_ptr reg, type
+.ifc \type, uv_420
+        add             \reg, \reg, #2*GRAIN_WIDTH-(3*32)
+.else
+        sub             \reg, \reg, #3*32-GRAIN_WIDTH
+.endif
+.endm
+
+.macro gen_grain_44 type
+function generate_grain_\type\()_8bpc_neon, export=1
+        AARCH64_SIGN_LINK_REGISTER
+        stp             x30, x19, [sp, #-96]!
+
+        mov             w13, w3
+        mov             w14, #28
+        add             x19, x1,  #3*GRAIN_WIDTH-3
+        mov             x1,  x2
+        mul             w13, w13, w14
+
+        movrel          x3,  X(gaussian_sequence)
+        ldr             w2,  [x1, #FGD_SEED]
+        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
+        add             x4,  x1,  #FGD_AR_COEFFS_UV
+        adr             x16, L(gen_grain_\type\()_tbl)
+        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
+        add             w9,  w9,  #4
+        ldrh            w17, [x16, w17, uxtw #1]
+        dup             v31.8h,  w9    // 4 + data->grain_scale_shift
+        sub             x16, x16, w17, uxtw
+        neg             v31.8h,  v31.8h
+
+        cmp             w13, #0
+        mov             w11, #0x49d8
+        mov             w14, #0xb524
+        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
+        csel            w11, w11, w14, ne
+
+        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
+        mov             w8,  #1
+        mov             w10, #1
+        lsl             w8,  w8,  w7        // 1 << ar_coeff_shift
+        lsl             w10, w10, w9        // 1 << (4 + data->grain_scale_shift)
+        lsr             w8,  w8,  #1        // 1 << (ar_coeff_shift - 1)
+        lsr             w10, w10, #1        // 1 << (4 + data->grain_scale_shift - 1)
+        mov             w5,  #127
+        mov             w6,  #-128
+
+        eor             w2,  w2,  w11
+
+        br              x16
+
+L(generate_grain_\type\()_lag0):
+        AARCH64_VALID_JUMP_TARGET
+        dup             v28.8h,  w7
+        ld1r            {v27.16b}, [x4]     // ar_coeffs_uv[0]
+        movi            v0.16b,  #0
+        movi            v1.16b,  #255
+        ext             v29.16b, v0.16b,  v1.16b,  #13
+        ext             v30.16b, v1.16b,  v0.16b,  #7
+        neg             v28.8h,  v28.8h
+
+        mov             w1,  #3
+        bl              generate_grain_rows_44_neon
+        set_height      w1,  \type
+1:
+        bl              get_grain_row_44_neon
+.ifc \type, uv_420
+        add             x12, x19, #GRAIN_WIDTH
+.endif
+        mov             v0.16b,  v29.16b
+        mov             v1.16b,  v16.16b
+        bl              add_\type\()_coeff_lag0_neon
+        movi            v0.16b,  #255
+        mov             v1.16b,  v17.16b
+        mov             v16.16b, v2.16b
+        bl              add_\type\()_coeff_lag0_neon
+        mov             v0.16b,  v30.16b
+        mov             v1.16b,  v18.16b
+        mov             v17.16b, v2.16b
+        bl              add_\type\()_coeff_lag0_neon
+        mov             v18.16b, v2.16b
+        subs            w1,  w1,  #1
+        increment_y_ptr x19, \type
+        store_grain_row_44 v16, v17, v18
+        b.gt            1b
+
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag1):
+        AARCH64_VALID_JUMP_TARGET
+        ld1r            {v27.16b}, [x4], #1 // ar_coeffs_uv[0]
+        ld1r            {v28.16b}, [x4], #1 // ar_coeffs_uv[1]
+        ld1r            {v29.16b}, [x4]     // ar_coeffs_uv[2]
+        add             x4,  x4,  #2
+
+        mov             w1,  #3
+        ld1r            {v30.16b}, [x4]     // ar_coeffs_uv[4]
+        ldursb          w4,  [x4, #-1]      // ar_coeffs_uv[3]
+        bl              generate_grain_rows_44_neon
+
+        set_height      w1,  \type
+1:
+        sum_\type\()_lag1 v20, v16, v16, v17, left
+        sum_\type\()_lag1 v21, v16, v17, v18
+        sum_\type\()_lag1 v18, v17, v18, v18, right
+        subs            w1,  w1,  #1
+        increment_y_ptr x19, \type
+        store_grain_row_44 v20, v21, v18
+        mov             v16.16b, v20.16b
+        mov             v17.16b, v21.16b
+        b.gt            1b
+
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag2):
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v30.16b}, [x4]     // ar_coeffs_uv[0-12]
+
+        smov            w4,  v30.b[10]
+        smov            w17, v30.b[11]
+
+        mov             w1,  #3
+        bl              generate_grain_rows_44_neon
+
+        set_height      w1,  \type
+1:
+        bl              sum_\type\()_lag2_left_neon
+        bl              sum_\type\()_lag2_mid_neon
+        bl              sum_\type\()_lag2_right_neon
+        subs            w1,  w1,  #1
+        increment_y_ptr x19, \type
+        add             x0,  x0,  #GRAIN_WIDTH-48
+        b.gt            1b
+
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag3):
+        AARCH64_VALID_JUMP_TARGET
+        ldr             q29,      [x4]      // ar_coeffs_uv[0-15]
+        ldr             q30,      [x4, #16] // ar_coeffs_uv[16-24]
+        stp             d8,  d9,  [sp, #16]
+        stp             d10, d11, [sp, #32]
+        stp             d12, d13, [sp, #48]
+        stp             d14, d15, [sp, #64]
+        stp             x20, x21, [sp, #80]
+
+        smov            w4,  v30.b[5]
+        smov            w20, v30.b[6]
+        smov            w21, v30.b[7]
+
+        mov             w1,  #3
+        bl              generate_grain_rows_44_neon
+
+        set_height      w1,  \type
+1:
+        bl              sum_\type\()_lag3_left_neon
+        bl              sum_\type\()_lag3_mid_neon
+        bl              sum_\type\()_lag3_right_neon
+        subs            w1,  w1,  #1
+        increment_y_ptr x19, \type
+        add             x0,  x0,  #GRAIN_WIDTH-48
+        b.gt            1b
+
+        ldp             x20, x21, [sp, #80]
+        ldp             d14, d15, [sp, #64]
+        ldp             d12, d13, [sp, #48]
+        ldp             d10, d11, [sp, #32]
+        ldp             d8,  d9,  [sp, #16]
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(gen_grain_\type\()_tbl):
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
+.macro gather_interleaved dst1, dst2, src1, src2, off
+        umov            w14, \src1[0+\off]
+        umov            w15, \src2[8+\off]
+        umov            w16, \src1[2+\off]
+        add             x14, x14, x3
+        umov            w17, \src2[10+\off]
+        add             x15, x15, x3
+        ld1             {\dst1}[0+\off],  [x14]
+        umov            w14, \src1[4+\off]
+        add             x16, x16, x3
+        ld1             {\dst2}[8+\off],  [x15]
+        umov            w15, \src2[12+\off]
+        add             x17, x17, x3
+        ld1             {\dst1}[2+\off],  [x16]
+        umov            w16, \src1[6+\off]
+        add             x14, x14, x3
+        ld1             {\dst2}[10+\off], [x17]
+        umov            w17, \src2[14+\off]
+        add             x15, x15, x3
+        ld1             {\dst1}[4+\off],  [x14]
+        add             x16, x16, x3
+        ld1             {\dst2}[12+\off], [x15]
+        add             x17, x17, x3
+        ld1             {\dst1}[6+\off],  [x16]
+        ld1             {\dst2}[14+\off], [x17]
+.endm
+
+.macro gather dst1, dst2, src1, src2
+        gather_interleaved \dst1, \dst2, \src1, \src2, 0
+        gather_interleaved \dst2, \dst1, \src2, \src1, 0
+        gather_interleaved \dst1, \dst2, \src1, \src2, 1
+        gather_interleaved \dst2, \dst1, \src2, \src1, 1
+.endm
+
+function gather32_neon
+        gather          v4.b, v5.b, v0.b, v1.b
+        ret
+endfunc
+
+function gather16_neon
+        gather_interleaved v4.b, v5.b, v0.b, v0.b, 0
+        gather_interleaved v4.b, v5.b, v0.b, v0.b, 1
+        ins             v4.d[1], v5.d[1]
+        ret
+endfunc
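gather32_neon/gather16_neon are a per-byte table lookup through x3, the scaling[SCALING_SIZE] argument; a minimal sketch (names illustrative):

    #include <stdint.h>

    static void gather_sketch(uint8_t scaled[], const uint8_t scaling[/* SCALING_SIZE */],
                              const uint8_t px[], int n /* 32 or 16 */)
    {
        for (int i = 0; i < n; i++)
            scaled[i] = scaling[px[i]];   /* umov + add x3 + ld1 lane load */
    }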
+
+const overlap_coeffs_0, align=4
+        .byte 27, 17, 0,  0,  0,  0,  0,  0
+        .byte 17, 27, 32, 32, 32, 32, 32, 32
+endconst
+
+const overlap_coeffs_1, align=4
+        .byte 23, 0,  0,  0,  0,  0,  0,  0
+        .byte 22, 32, 32, 32, 32, 32, 32, 32
+endconst
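These constants feed the smull/smlal + sqrshrn #5 sequences further down: overlapped grain is a rounded 5-bit fixed-point mix of the old and new values, with weights (27, 17) then (17, 27) across the two overlapped positions at full resolution, (23, 22) for the single overlapped position in a subsampled direction, and (0, 32) acting as a pass-through elsewhere. A sketch of that blend (not the dav1d reference code; sqrshrn additionally saturates the result to the grain range):

    static int blend_grain_sketch(int old_g, int new_g, int c_old, int c_new)
    {
        return (old_g * c_old + new_g * c_new + 16) >> 5;   /* round2(.., 5) */
    }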
+
+.macro calc_offset offx, offy, src, sx, sy
+        and             \offy, \src,  #0xF     // randval & 0xF
+        lsr             \offx, \src,  #4       // randval >> 4
+.if \sy == 0
+        add             \offy, \offy, \offy    // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+        add             \offx, \offx, \offx    // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride
+        madd            \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+        add             \dst, \dst, \offx, uxtw // grain_lut += offx
+.endm
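In C terms the two macros above amount to the following (sketch; randval, sx, sy and grain_stride follow the inline comments):

    #include <stddef.h>
    #include <stdint.h>

    static const int8_t *add_offset_sketch(const int8_t *grain_lut, unsigned randval,
                                           int sx, int sy, ptrdiff_t grain_stride)
    {
        unsigned offy = randval & 0xF;   /* randval & 0xF */
        unsigned offx = randval >> 4;    /* randval >> 4  */
        if (!sy) offy *= 2;
        if (!sx) offx *= 2;
        return grain_lut + offy * grain_stride + offx;
    }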
+
+// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
+//                                const ptrdiff_t stride,
+//                                const uint8_t scaling[SCALING_SIZE],
+//                                const int scaling_shift,
+//                                const entry grain_lut[][GRAIN_WIDTH],
+//                                const int offsets[][2],
+//                                const int h, const ptrdiff_t clip,
+//                                const ptrdiff_t type);
+function fgy_32x32_8bpc_neon, export=1
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+        ldr             w11, [x6, #8]          // offsets[1][0]
+        ldr             w13, [x6, #4]          // offsets[0][1]
+        ldr             w15, [x6, #12]         // offsets[1][1]
+        ldr             w6,  [x6]              // offsets[0][0]
+        ldr             w8,  [sp, #16]         // clip
+        mov             x9,  #GRAIN_WIDTH      // grain_lut stride
+
+        neg             w4,  w4
+        dup             v29.8h,  w4            // -scaling_shift
+
+        movrel          x16, overlap_coeffs_0
+
+        cbz             w8,  1f
+        // clip
+        movi            v30.16b, #16
+        movi            v31.16b, #235
+        b               2f
+1:
+        // no clip
+        movi            v30.16b, #0
+        movi            v31.16b, #255
+2:
+
+        ld1             {v27.8b, v28.8b}, [x16] // overlap_coeffs
+
+        add             x5,  x5,  #9           // grain_lut += 9
+        add             x5,  x5,  x9,  lsl #3  // grain_lut += 8 * grain_stride
+        add             x5,  x5,  x9           // grain_lut += grain_stride
+
+        calc_offset     w11, w12, w11, 0,  0
+        calc_offset     w13, w14, w13, 0,  0
+        calc_offset     w15, w16, w15, 0,  0
+        calc_offset     w6,  w10, w6,  0,  0
+
+        add_offset      x12, w11, x12, x5,  x9
+        add_offset      x14, w13, x14, x5,  x9
+        add_offset      x16, w15, x16, x5,  x9
+        add_offset      x5,  w6,  x10, x5,  x9
+
+        ldr             w11, [sp, #24]         // type
+        adr             x13, L(fgy_loop_tbl)
+
+        add             x4,  x12, #32          // grain_lut += BLOCK_SIZE * bx
+        add             x6,  x14, x9,  lsl #5  // grain_lut += grain_stride * BLOCK_SIZE * by
+
+        tst             w11, #1
+        ldrh            w11, [x13, w11, uxtw #1]
+
+        add             x8,  x16, x9,  lsl #5  // grain_lut += grain_stride * BLOCK_SIZE * by
+        add             x8,  x8,  #32          // grain_lut += BLOCK_SIZE * bx
+
+        sub             x11, x13, w11, uxtw
+
+        b.eq            1f
+        // y overlap
+        dup             v6.16b,  v27.b[0]
+        dup             v7.16b,  v27.b[1]
+        mov             w10, w7                // backup actual h
+        mov             w7,  #2
+1:
+        br              x11
+endfunc
+
+function fgy_loop_neon
+.macro fgy ox, oy
+L(loop_\ox\oy):
+        AARCH64_VALID_JUMP_TARGET
+1:
+        ld1             {v0.16b,  v1.16b},  [x1],  x2 // src
+.if \ox
+        ld1             {v20.8b},           [x4],  x9 // grain_lut old
+.endif
+.if \oy
+        ld1             {v22.16b, v23.16b}, [x6],  x9 // grain_lut top
+.endif
+.if \ox && \oy
+        ld1             {v21.8b},           [x8],  x9 // grain_lut top old
+.endif
+        ld1             {v18.16b, v19.16b}, [x5],  x9 // grain_lut
+
+        bl              gather32_neon
+
+.if \ox
+        smull           v20.8h,  v20.8b,  v27.8b
+        smlal           v20.8h,  v18.8b,  v28.8b
+.endif
+
+.if \oy
+.if \ox
+        smull           v21.8h,  v21.8b,  v27.8b
+        smlal           v21.8h,  v22.8b,  v28.8b
+        sqrshrn         v20.8b,  v20.8h,  #5
+        sqrshrn         v21.8b,  v21.8h,  #5
+.endif
+
+.if \ox
+        smull           v16.8h,  v20.8b,  v7.8b
+.else
+        smull           v16.8h,  v18.8b,  v7.8b
+.endif
+        smull2          v17.8h,  v18.16b, v7.16b
+        smull           v18.8h,  v19.8b,  v7.8b
+        smull2          v19.8h,  v19.16b, v7.16b
+.if \ox
+        smlal           v16.8h,  v21.8b,  v6.8b
+.else
+        smlal           v16.8h,  v22.8b,  v6.8b
+.endif
+        smlal2          v17.8h,  v22.16b, v6.16b
+        smlal           v18.8h,  v23.8b,  v6.8b
+        smlal2          v19.8h,  v23.16b, v6.16b
+        sqrshrn         v22.8b,  v16.8h,  #5
+        sqrshrn2        v22.16b, v17.8h,  #5
+        sqrshrn         v23.8b,  v18.8h,  #5
+        sqrshrn2        v23.16b, v19.8h,  #5
+.endif
+
+        // sxtl of grain
+.if \oy
+        sxtl            v16.8h,  v22.8b
+        sxtl2           v17.8h,  v22.16b
+        sxtl            v18.8h,  v23.8b
+        sxtl2           v19.8h,  v23.16b
+.elseif \ox
+        sqrshrn         v20.8b,  v20.8h,  #5
+        sxtl2           v17.8h,  v18.16b
+        sxtl            v18.8h,  v19.8b
+        sxtl2           v19.8h,  v19.16b
+        sxtl            v16.8h,  v20.8b
+.else
+        sxtl            v16.8h,  v18.8b
+        sxtl2           v17.8h,  v18.16b
+        sxtl            v18.8h,  v19.8b
+        sxtl2           v19.8h,  v19.16b
+.endif
+
+        uxtl            v2.8h,   v4.8b   // scaling
+        uxtl2           v3.8h,   v4.16b
+        uxtl            v4.8h,   v5.8b
+        uxtl2           v5.8h,   v5.16b
+
+        mul             v16.8h,  v16.8h,  v2.8h   // scaling * grain
+        mul             v17.8h,  v17.8h,  v3.8h
+        mul             v18.8h,  v18.8h,  v4.8h
+        mul             v19.8h,  v19.8h,  v5.8h
+
+        srshl           v16.8h,  v16.8h,  v29.8h  // round2(scaling * grain, scaling_shift)
+        srshl           v17.8h,  v17.8h,  v29.8h
+        srshl           v18.8h,  v18.8h,  v29.8h
+        srshl           v19.8h,  v19.8h,  v29.8h
+
+        uaddw           v16.8h,  v16.8h,  v0.8b   // *src + noise
+        uaddw2          v17.8h,  v17.8h,  v0.16b
+        uaddw           v18.8h,  v18.8h,  v1.8b
+        uaddw2          v19.8h,  v19.8h,  v1.16b
+
+        sqxtun          v0.8b,   v16.8h
+        sqxtun2         v0.16b,  v17.8h
+        sqxtun          v1.8b,   v18.8h
+        sqxtun2         v1.16b,  v19.8h
+
+        umax            v0.16b,  v0.16b,  v30.16b
+        umax            v1.16b,  v1.16b,  v30.16b
+        umin            v0.16b,  v0.16b,  v31.16b
+        umin            v1.16b,  v1.16b,  v31.16b
+
+        subs            w7,  w7,  #1
+.if \oy
+        dup             v6.16b,  v28.b[0]
+        dup             v7.16b,  v28.b[1]
+.endif
+        st1             {v0.16b,  v1.16b},  [x0], x2 // dst
+        b.gt            1b
+
+.if \oy
+        cmp             w10, #2
+        sub             w7,  w10, #2           // restore actual remaining h
+        b.gt            L(loop_\ox\()0)
+.endif
+        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+.endm
+
+        fgy             0, 0
+        fgy             0, 1
+        fgy             1, 0
+        fgy             1, 1
+
+L(fgy_loop_tbl):
+        .hword L(fgy_loop_tbl) - L(loop_00)
+        .hword L(fgy_loop_tbl) - L(loop_01)
+        .hword L(fgy_loop_tbl) - L(loop_10)
+        .hword L(fgy_loop_tbl) - L(loop_11)
+endfunc
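Putting the loop above together, each output pixel is produced roughly as follows (a sketch using the argument names from the prototype comment; grain is the possibly overlap-blended value read from grain_lut):

    #include <stdint.h>

    static uint8_t fgy_pixel_sketch(uint8_t src, int grain,
                                    const uint8_t scaling[/* SCALING_SIZE */],
                                    int scaling_shift, int clip)
    {
        const int round = 1 << (scaling_shift - 1);
        const int noise = (scaling[src] * grain + round) >> scaling_shift; /* mul + srshl     */
        int v = src + noise;                                               /* "*src + noise"  */
        const int mn = clip ? 16 : 0, mx = clip ? 235 : 255;
        return (uint8_t)(v < mn ? mn : v > mx ? mx : v);                   /* sqxtun + umax/umin */
    }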
+
+// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
+//                                     const pixel *const src,
+//                                     const ptrdiff_t stride,
+//                                     const uint8_t scaling[SCALING_SIZE],
+//                                     const Dav1dFilmGrainData *const data,
+//                                     const entry grain_lut[][GRAIN_WIDTH],
+//                                     const pixel *const luma_row,
+//                                     const ptrdiff_t luma_stride,
+//                                     const int offsets[][2],
+//                                     const ptrdiff_t h, const ptrdiff_t uv,
+//                                     const ptrdiff_t is_id,
+//                                     const ptrdiff_t type);
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_8bpc_neon, export=1
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30,      [sp, #-32]!
+        str             d8,       [sp, #16]
+        ldp             x8,  x9,  [sp, #32]    // offsets, h
+        ldp             x10, x11, [sp, #48]    // uv, is_id
+
+        ldr             w13, [x4, #FGD_SCALING_SHIFT]
+        ldr             w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+        neg             w13, w13               // -scaling_shift
+
+        // !csfl
+        add             x10, x4,  x10, lsl #2  // + 4*uv
+        add             x14, x10, #FGD_UV_LUMA_MULT
+        add             x15, x10, #FGD_UV_MULT
+        add             x10, x10, #FGD_UV_OFFSET
+        ld1             {v8.h}[0], [x14]       // uv_luma_mult
+        ld1r            {v24.8h},  [x10]       // uv_offset
+        ld1             {v8.h}[1], [x15]       // uv_mult
+
+        dup             v29.8h,  w13           // -scaling_shift
+
+        cbz             w12, 1f
+        // clip
+        movi            v30.16b, #16
+        movi            v31.16b, #240
+        cbz             w11, 2f
+        // is_id
+        movi            v31.16b, #235
+        b               2f
+1:
+        // no clip
+        movi            v30.16b, #0
+        movi            v31.16b, #255
+2:
+
+        ldr             w12, [x8, #8]          // offsets[1][0]
+        ldr             w14, [x8, #4]          // offsets[0][1]
+        ldr             w16, [x8, #12]         // offsets[1][1]
+        ldr             w8,  [x8]              // offsets[0][0]
+
+        mov             x10, #GRAIN_WIDTH      // grain_lut stride
+
+        add             x5,  x5,  #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
+.if \sy
+        add             x5,  x5,  x10, lsl #2  // grain_lut += 4 * grain_stride
+        add             x5,  x5,  x10, lsl #1  // grain_lut += 2 * grain_stride
+.else
+        add             x5,  x5,  x10, lsl #3  // grain_lut += 8 * grain_stride
+        add             x5,  x5,  x10          // grain_lut += grain_stride
+.endif
+
+        calc_offset     w12, w13, w12, \sx, \sy
+        calc_offset     w14, w15, w14, \sx, \sy
+        calc_offset     w16, w17, w16, \sx, \sy
+        calc_offset     w8,  w11, w8,  \sx, \sy
+
+        add_offset      x13, w12, x13, x5,  x10
+        add_offset      x15, w14, x15, x5,  x10
+        add_offset      x17, w16, x17, x5,  x10
+        add_offset      x5,  w8,  x11, x5,  x10
+
+        add             x4,  x13, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+        add             x8,  x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+        add             x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+        add             x11, x11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+
+        ldr             w13, [sp, #64]         // type
+
+        movrel          x16, overlap_coeffs_\sx
+        adr             x14, L(fguv_loop_sx\sx\()_tbl)
+
+        ld1             {v27.8b, v28.8b}, [x16] // overlap_coeffs
+        tst             w13, #1
+        ldrh            w13, [x14, w13, uxtw #1]
+
+        b.eq            1f
+        // y overlap
+        sub             w12, w9,  #(2 >> \sy)  // backup remaining h
+        mov             w9,  #(2 >> \sy)
+
+1:
+        sub             x13, x14, w13, uxtw
+
+.if \sy
+        movi            v25.16b, #23
+        movi            v26.16b, #22
+.else
+        movi            v25.16b, #27
+        movi            v26.16b, #17
+.endif
+
+.if \sy
+        add             x7,  x7,  x7           // luma_stride *= 2
+.endif
+
+        br              x13
+endfunc
+.endm
+
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+function fguv_loop_sx0_neon
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+        AARCH64_VALID_JUMP_TARGET
+1:
+        ld1             {v0.16b,  v1.16b},  [x6],  x7  // luma
+        ld1             {v6.16b,  v7.16b},  [x1],  x2  // src
+.if \ox
+        ld1             {v20.8b},           [x4],  x10 // grain_lut old
+.endif
+.if \oy
+        ld1             {v22.16b, v23.16b}, [x8],  x10 // grain_lut top
+.endif
+.if \ox && \oy
+        ld1             {v21.8b},           [x11], x10 // grain_lut top old
+.endif
+        ld1             {v18.16b, v19.16b}, [x5],  x10 // grain_lut
+
+.if !\csfl
+        uxtl            v2.8h,   v0.8b
+        uxtl2           v3.8h,   v0.16b
+        uxtl            v4.8h,   v1.8b
+        uxtl2           v5.8h,   v1.16b
+        uxtl            v0.8h,   v6.8b
+        uxtl2           v1.8h,   v6.16b
+        uxtl            v16.8h,  v7.8b
+        uxtl2           v17.8h,  v7.16b
+        mul             v2.8h,   v2.8h,   v8.h[0]
+        mul             v3.8h,   v3.8h,   v8.h[0]
+        mul             v4.8h,   v4.8h,   v8.h[0]
+        mul             v5.8h,   v5.8h,   v8.h[0]
+        mul             v0.8h,   v0.8h,   v8.h[1]
+        mul             v1.8h,   v1.8h,   v8.h[1]
+        mul             v16.8h,  v16.8h,  v8.h[1]
+        mul             v17.8h,  v17.8h,  v8.h[1]
+        sqadd           v2.8h,   v2.8h,   v0.8h
+        sqadd           v3.8h,   v3.8h,   v1.8h
+        sqadd           v4.8h,   v4.8h,   v16.8h
+        sqadd           v5.8h,   v5.8h,   v17.8h
+        sshr            v2.8h,   v2.8h,   #6
+        sshr            v3.8h,   v3.8h,   #6
+        sshr            v4.8h,   v4.8h,   #6
+        sshr            v5.8h,   v5.8h,   #6
+        add             v2.8h,   v2.8h,   v24.8h
+        add             v3.8h,   v3.8h,   v24.8h
+        add             v4.8h,   v4.8h,   v24.8h
+        add             v5.8h,   v5.8h,   v24.8h
+        sqxtun          v0.8b,   v2.8h
+        sqxtun2         v0.16b,  v3.8h
+        sqxtun          v1.8b,   v4.8h
+        sqxtun2         v1.16b,  v5.8h
+.endif
+
+        bl              gather32_neon
+
+.if \ox
+        smull           v20.8h,  v20.8b,  v27.8b
+        smlal           v20.8h,  v18.8b,  v28.8b
+.endif
+
+.if \oy
+.if \ox
+        smull           v21.8h,  v21.8b,  v27.8b
+        smlal           v21.8h,  v22.8b,  v28.8b
+        sqrshrn         v20.8b,  v20.8h,  #5
+        sqrshrn         v21.8b,  v21.8h,  #5
+.endif
+
+.if \ox
+        smull           v16.8h,  v20.8b,  v26.8b
+.else
+        smull           v16.8h,  v18.8b,  v26.8b
+.endif
+        smull2          v17.8h,  v18.16b, v26.16b
+        smull           v18.8h,  v19.8b,  v26.8b
+        smull2          v19.8h,  v19.16b, v26.16b
+.if \ox
+        smlal           v16.8h,  v21.8b,  v25.8b
+.else
+        smlal           v16.8h,  v22.8b,  v25.8b
+.endif
+        smlal2          v17.8h,  v22.16b, v25.16b
+        smlal           v18.8h,  v23.8b,  v25.8b
+        smlal2          v19.8h,  v23.16b, v25.16b
+        sqrshrn         v22.8b,  v16.8h,  #5
+        sqrshrn2        v22.16b, v17.8h,  #5
+        sqrshrn         v23.8b,  v18.8h,  #5
+        sqrshrn2        v23.16b, v19.8h,  #5
+.endif
+
+        // sxtl of grain
+.if \oy
+        sxtl            v16.8h,  v22.8b
+        sxtl2           v17.8h,  v22.16b
+        sxtl            v18.8h,  v23.8b
+        sxtl2           v19.8h,  v23.16b
+.elseif \ox
+        sqrshrn         v20.8b,  v20.8h,  #5
+        sxtl2           v17.8h,  v18.16b
+        sxtl            v18.8h,  v19.8b
+        sxtl2           v19.8h,  v19.16b
+        sxtl            v16.8h,  v20.8b
+.else
+        sxtl            v16.8h,  v18.8b
+        sxtl2           v17.8h,  v18.16b
+        sxtl            v18.8h,  v19.8b
+        sxtl2           v19.8h,  v19.16b
+.endif
+
+        uxtl            v2.8h,   v4.8b   // scaling
+        uxtl2           v3.8h,   v4.16b
+        uxtl            v4.8h,   v5.8b
+        uxtl2           v5.8h,   v5.16b
+
+        mul             v16.8h,  v16.8h,  v2.8h   // scaling * grain
+        mul             v17.8h,  v17.8h,  v3.8h
+        mul             v18.8h,  v18.8h,  v4.8h
+        mul             v19.8h,  v19.8h,  v5.8h
+
+        srshl           v16.8h,  v16.8h,  v29.8h  // round2(scaling * grain, scaling_shift)
+        srshl           v17.8h,  v17.8h,  v29.8h
+        srshl           v18.8h,  v18.8h,  v29.8h
+        srshl           v19.8h,  v19.8h,  v29.8h
+
+        uaddw           v16.8h,  v16.8h,  v6.8b   // *src + noise
+        uaddw2          v17.8h,  v17.8h,  v6.16b
+        uaddw           v18.8h,  v18.8h,  v7.8b
+        uaddw2          v19.8h,  v19.8h,  v7.16b
+
+        sqxtun          v0.8b,   v16.8h
+        sqxtun2         v0.16b,  v17.8h
+        sqxtun          v1.8b,   v18.8h
+        sqxtun2         v1.16b,  v19.8h
+
+        umax            v0.16b,  v0.16b,  v30.16b
+        umax            v1.16b,  v1.16b,  v30.16b
+        umin            v0.16b,  v0.16b,  v31.16b
+        umin            v1.16b,  v1.16b,  v31.16b
+
+        subs            w9,  w9,  #1
+.if \oy
+        dup             v25.16b, v28.b[0]
+        dup             v26.16b, v28.b[1]
+.endif
+        st1             {v0.16b,  v1.16b},  [x0], x2 // dst
+        b.gt            1b
+
+.if \oy
+        cmp             w12, #0
+        mov             w9,  w12               // restore actual remaining h
+        b.gt            L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
+.endif
+        b               9f
+.endm
+        fguv_loop_sx0   0, 0, 0
+        fguv_loop_sx0   0, 0, 1
+        fguv_loop_sx0   0, 1, 0
+        fguv_loop_sx0   0, 1, 1
+        fguv_loop_sx0   1, 0, 0
+        fguv_loop_sx0   1, 0, 1
+        fguv_loop_sx0   1, 1, 0
+        fguv_loop_sx0   1, 1, 1
+
+9:
+        ldr             d8,       [sp, #16]
+        ldr             x30,      [sp], #32
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(fguv_loop_sx0_tbl):
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
+endfunc
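The !csfl branch above maps luma and the chroma source pixel to the scaling index before the gather; a hedged sketch of that mapping (uv_luma_mult, uv_mult and uv_offset are the Dav1dFilmGrainData fields loaded in the setup code; in the subsampled fguv_loop_sx1 variant, luma is first the average of two horizontal luma pixels):

    static int scaling_index_noncsfl_sketch(int luma, int src_uv, int uv_luma_mult,
                                            int uv_mult, int uv_offset)
    {
        int t = (luma * uv_luma_mult + src_uv * uv_mult) >> 6; /* mul + sqadd + sshr #6 */
        t += uv_offset;                                        /* add v24.8h            */
        return t < 0 ? 0 : t > 255 ? 255 : t;                  /* sqxtun saturation     */
    }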
+
+function fguv_loop_sx1_neon
+.macro fguv_loop_sx1 csfl, ox, oy
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+        AARCH64_VALID_JUMP_TARGET
+1:
+        ld1             {v0.16b, v1.16b},  [x6],  x7  // luma
+        ld1             {v6.16b},          [x1],  x2  // src
+.if \ox
+        ld1             {v20.8b},          [x4],  x10 // grain_lut old
+.endif
+.if \oy
+        ld1             {v22.16b},         [x8],  x10 // grain_lut top
+.endif
+.if \ox && \oy
+        ld1             {v21.8b},          [x11], x10 // grain_lut top old
+.endif
+        ld1             {v18.16b},         [x5],  x10 // grain_lut
+
+        uaddlp          v2.8h,   v0.16b
+        uaddlp          v3.8h,   v1.16b
+.if \csfl
+        rshrn           v0.8b,   v2.8h,   #1
+        rshrn2          v0.16b,  v3.8h,   #1
+.else
+        urshr           v2.8h,   v2.8h,   #1
+        urshr           v3.8h,   v3.8h,   #1
+        uxtl            v0.8h,   v6.8b
+        uxtl2           v1.8h,   v6.16b
+        mul             v2.8h,   v2.8h,   v8.h[0]
+        mul             v3.8h,   v3.8h,   v8.h[0]
+        mul             v0.8h,   v0.8h,   v8.h[1]
+        mul             v1.8h,   v1.8h,   v8.h[1]
+        sqadd           v2.8h,   v2.8h,   v0.8h
+        sqadd           v3.8h,   v3.8h,   v1.8h
+        sshr            v2.8h,   v2.8h,   #6
+        sshr            v3.8h,   v3.8h,   #6
+        add             v2.8h,   v2.8h,   v24.8h
+        add             v3.8h,   v3.8h,   v24.8h
+        sqxtun          v0.8b,   v2.8h
+        sqxtun2         v0.16b,  v3.8h
+.endif
+
+        bl              gather16_neon
+
+.if \ox
+        smull           v20.8h,  v20.8b,  v27.8b
+        smlal           v20.8h,  v18.8b,  v28.8b
+.endif
+
+.if \oy
+.if \ox
+        smull           v21.8h,  v21.8b,  v27.8b
+        smlal           v21.8h,  v22.8b,  v28.8b
+        sqrshrn         v20.8b,  v20.8h,  #5
+        sqrshrn         v21.8b,  v21.8h,  #5
+.endif
+
+.if \ox
+        smull           v16.8h,  v20.8b,  v26.8b
+.else
+        smull           v16.8h,  v18.8b,  v26.8b
+.endif
+        smull2          v17.8h,  v18.16b, v26.16b
+.if \ox
+        smlal           v16.8h,  v21.8b,  v25.8b
+.else
+        smlal           v16.8h,  v22.8b,  v25.8b
+.endif
+        smlal2          v17.8h,  v22.16b, v25.16b
+        sqrshrn         v22.8b,  v16.8h,  #5
+        sqrshrn2        v22.16b, v17.8h,  #5
+.endif
+
+        // sxtl of grain
+.if \oy
+        sxtl            v16.8h,  v22.8b
+        sxtl2           v17.8h,  v22.16b
+.elseif \ox
+        sqrshrn         v20.8b,  v20.8h,  #5
+        sxtl2           v17.8h,  v18.16b
+        sxtl            v16.8h,  v20.8b
+.else
+        sxtl            v16.8h,  v18.8b
+        sxtl2           v17.8h,  v18.16b
+.endif
+
+        uxtl            v2.8h,   v4.8b   // scaling
+        uxtl2           v3.8h,   v4.16b
+
+        mul             v16.8h,  v16.8h,  v2.8h   // scaling * grain
+        mul             v17.8h,  v17.8h,  v3.8h
+
+        srshl           v16.8h,  v16.8h,  v29.8h  // round2(scaling * grain, scaling_shift)
+        srshl           v17.8h,  v17.8h,  v29.8h
+
+        uaddw           v16.8h,  v16.8h,  v6.8b   // *src + noise
+        uaddw2          v17.8h,  v17.8h,  v6.16b
+
+        sqxtun          v0.8b,   v16.8h
+        sqxtun2         v0.16b,  v17.8h
+
+        umax            v0.16b,  v0.16b,  v30.16b
+        umin            v0.16b,  v0.16b,  v31.16b
+
+.if \oy
+        mov             v16.16b, v25.16b
+.endif
+        subs            w9,  w9,  #1
+.if \oy
+        mov             v25.16b, v26.16b
+        mov             v26.16b, v16.16b
+.endif
+        st1             {v0.16b},  [x0], x2 // dst
+        b.gt            1b
+
+.if \oy
+        cmp             w12, #0
+        mov             w9,  w12               // restore actual remaining h
+        b.gt            L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
+.endif
+
+        b               9f
+.endm
+        fguv_loop_sx1   0, 0, 0
+        fguv_loop_sx1   0, 0, 1
+        fguv_loop_sx1   0, 1, 0
+        fguv_loop_sx1   0, 1, 1
+        fguv_loop_sx1   1, 0, 0
+        fguv_loop_sx1   1, 0, 1
+        fguv_loop_sx1   1, 1, 0
+        fguv_loop_sx1   1, 1, 1
+
+9:
+        ldr             d8,       [sp, #16]
+        ldr             x30,      [sp], #32
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(fguv_loop_sx1_tbl):
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
+endfunc
diff --git a/third_party/dav1d/src/arm/64/filmgrain16.S b/third_party/dav1d/src/arm/64/filmgrain16.S
new file mode 100644
index 0000000000000..7c4ff6dda9435
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/filmgrain16.S
@@ -0,0 +1,1997 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+.macro increment_seed steps, shift=1
+        lsr             w11, w2,  #3
+        lsr             w12, w2,  #12
+        lsr             w13, w2,  #1
+        eor             w11, w2,  w11                     // (r >> 0) ^ (r >> 3)
+        eor             w12, w12, w13                     // (r >> 12) ^ (r >> 1)
+        eor             w11, w11, w12                     // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+        lsr             w2,  w2,  #\steps
+.endif
+        and             w11, w11, #((1 << \steps) - 1)    // bit
+.if \shift
+        orr             w2,  w2,  w11, lsl #(16 - \steps) // *state
+.else
+        orr             w2,  w2,  w11, lsl #16            // *state
+.endif
+.endm
+
+.macro read_rand dest, bits, age
+        ubfx            \dest,  x2,   #16 - \bits - \age, #\bits
+.endm
+
+.macro read_shift_rand dest, bits
+        ubfx            \dest,  x2,   #17 - \bits, #\bits
+        lsr             w2,  w2,  #1
+.endm
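A C rendering of the macros above for the default shift=1 case, matching their inline comments (helper names are illustrative; read_shift_rand does the same extraction one bit higher and then shifts the state right by one):

    static unsigned increment_seed_sketch(unsigned r /* 16-bit state */, int steps)
    {
        const unsigned bit = ((r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1))
                             & ((1u << steps) - 1);
        return (r >> steps) | (bit << (16 - steps));
    }

    static unsigned read_rand_sketch(unsigned state, int bits, int age)
    {
        return (state >> (16 - bits - age)) & ((1u << bits) - 1);   /* ubfx */
    }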
+
+// special calling convention:
+// w2 holds seed
+// x3 holds dav1d_gaussian_sequence
+// clobbers x11-x15
+// returns in v0.8h
+function get_gaussian_neon
+        increment_seed  4
+        read_rand       x14, 11,  3
+        read_rand       x15, 11,  2
+        add             x14, x3,  x14, lsl #1
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[0], [x14]
+        read_rand       x14, 11,  1
+        ld1             {v0.h}[1], [x15]
+        add             x14, x3,  x14, lsl #1
+        read_rand       x15, 11,  0
+        increment_seed  4
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[2], [x14]
+        read_rand       x14, 11,  3
+        ld1             {v0.h}[3], [x15]
+        add             x14, x3,  x14, lsl #1
+        read_rand       x15, 11,  2
+        ld1             {v0.h}[4], [x14]
+        add             x15, x3,  x15, lsl #1
+        read_rand       x14, 11,  1
+        ld1             {v0.h}[5], [x15]
+        read_rand       x15, 11,  0
+        add             x14, x3,  x14, lsl #1
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[6], [x14]
+        ld1             {v0.h}[7], [x15]
+        ret
+endfunc
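In terms of the helper sketches above, get_gaussian_neon fills its eight lanes with table entries, four per seed update (a sketch; the table is dav1d_gaussian_sequence, 2048 16-bit entries, and the caller applies the rounding shift held in v31):

    #include <stdint.h>

    static void get_gaussian_sketch(int16_t out[8], unsigned *seed,
                                    const int16_t gaussian_sequence[2048])
    {
        for (int i = 0; i < 8; i += 4) {
            *seed = increment_seed_sketch(*seed, 4);
            for (int age = 3; age >= 0; age--)   /* read_rand 11, age */
                out[i + (3 - age)] = gaussian_sequence[read_rand_sketch(*seed, 11, age)];
        }
    }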
+
+.macro store_grain_row r0, r1, r2, r3, r4, r5
+        st1             {\r0\().16b,\r1\().16b}, [x0], #32
+        st1             {\r2\().16b,\r3\().16b}, [x0], #32
+        st1             {\r4\().16b},  [x0], #16
+        st1             {\r5\().h}[0], [x0], #2
+.endm
+
+function get_grain_2_neon
+        increment_seed  2
+        read_rand       x14, 11,  1
+        read_rand       x15, 11,  0
+        add             x14, x3,  x14, lsl #1
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[0], [x14]
+        ld1             {v0.h}[1], [x15]
+        srshl           v0.4h,   v0.4h,   v31.4h
+        ret
+endfunc
+
+.macro get_grain_2 dst
+        bl              get_grain_2_neon
+.ifnc \dst, v0
+        mov             \dst\().8b, v0.8b
+.endif
+.endm
+
+function get_grain_4_neon
+        increment_seed  4
+        read_rand       x14, 11,  3
+        read_rand       x15, 11,  2
+        add             x14, x3,  x14, lsl #1
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[0], [x14]
+        read_rand       x14, 11,  1
+        ld1             {v0.h}[1], [x15]
+        add             x14, x3,  x14, lsl #1
+        read_rand       x15, 11,  0
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[2], [x14]
+        ld1             {v0.h}[3], [x15]
+        srshl           v0.4h,   v0.4h,   v31.4h
+        ret
+endfunc
+
+.macro get_grain_4 dst
+        bl              get_grain_4_neon
+.ifnc \dst, v0
+        mov             \dst\().8b, v0.8b
+.endif
+.endm
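+
+// get_gaussian_neon fills v0.8h with eight raw draws from
+// dav1d_gaussian_sequence; get_grain_2/get_grain_4 draw two/four values
+// and also apply the rounded grain scale shift (v31 holds the negated
+// shift amount, so srshl shifts right with rounding).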
+
+// w15 holds the number of entries to produce
+// w14, w16 and w17 hold the previous output entries
+// v0 holds the vector of produced entries
+// v1 holds the input vector of sums from above
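+// Each iteration adds the rounded AR prediction (>> ar_coeff_shift) to a
+// fresh rounded Gaussian draw and clamps the result to the legal grain
+// range held in w6 (minimum) and w5 (maximum).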
+.macro output_lag n
+function output_lag\n\()_neon
+1:
+        read_shift_rand x13, 11
+        mov             w11, v1.s[0]
+        ldrsh           w12, [x3, x13, lsl #1]
+        ext             v0.16b,  v0.16b,  v0.16b,  #2
+.if \n == 1
+        madd            w11, w14, w4,  w11        // sum (above) + *coeff * prev output
+.elseif \n == 2
+        madd            w11, w16, w4,  w11        // sum (above) + *coeff * prev output 1
+        madd            w11, w14, w17, w11        // += *coeff * prev output 2
+        mov             w16, w14
+.else
+        madd            w11, w17, w4,  w11        // sum (above) + *coeff * prev output 1
+        madd            w11, w16, w20, w11        // sum (above) + *coeff * prev output 2
+        madd            w11, w14, w21, w11        // += *coeff * prev output 3
+        mov             w17, w16
+        mov             w16, w14
+.endif
+        add             w14, w11, w8              // 1 << (ar_coeff_shift - 1)
+        add             w12, w12, w10             // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
+        asr             w14, w14, w7              // >> ar_coeff_shift
+        asr             w12, w12, w9              // >> (4 - bitdepth_min_8 + grain_scale_shift)
+        add             w14, w14, w12
+        cmp             w14, w5
+        csel            w14, w14, w5,  le
+        cmp             w14, w6
+        csel            w14, w14, w6,  ge
+        subs            w15, w15, #1
+        ext             v1.16b,  v1.16b,  v1.16b,  #4
+        ins             v0.h[7], w14
+        b.gt            1b
+        ret
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+
+
+function sum_lag1_above_neon
+        sub             x12, x0,  #1*GRAIN_WIDTH*2 - 16
+        ld1             {v18.8h}, [x12] // load top right
+
+        ext             v0.16b,  v16.16b, v17.16b, #14 // top left, top mid
+        ext             v1.16b,  v17.16b, v18.16b, #2  // top mid, top right
+
+        smull           v4.4s,   v17.4h,  v28.4h
+        smlal           v4.4s,   v0.4h,   v27.4h
+        smlal           v4.4s,   v1.4h,   v29.4h
+        smull2          v5.4s,   v17.8h,  v28.8h
+        smlal2          v5.4s,   v0.8h,   v27.8h
+        smlal2          v5.4s,   v1.8h,   v29.8h
+
+        mov             v16.16b, v17.16b
+        mov             v17.16b, v18.16b
+
+        ret
+endfunc
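+
+// Accumulate the lag-1 contribution of the row above (top-left/top/
+// top-right weighted by v27/v28/v29) into the 32-bit sums v4/v5 and slide
+// the cached row window v16/v17 one vector to the right.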
+
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff
+        bl              sum_\lag\()_above_neon
+.ifc \type, uv_420
+        add             x12, x19, #GRAIN_WIDTH*2
+        ld1             {v22.8h, v23.8h}, [x19], #32
+        ld1             {v24.8h, v25.8h}, [x12]
+        addp            v22.8h,  v22.8h,  v23.8h
+        addp            v23.8h,  v24.8h,  v25.8h
+        add             v22.8h,  v22.8h,  v23.8h
+        srshr           v0.8h,   v22.8h,  #2
+.endif
+.ifc \type, uv_422
+        ld1             {v22.8h, v23.8h}, [x19], #32
+        addp            v22.8h,  v22.8h,  v23.8h
+        srshr           v0.8h,   v22.8h,  #1
+.endif
+.ifc \type, uv_444
+        ld1             {v0.8h}, [x19], #16
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+        dup             v1.8b,   \uv_coeff
+        sxtl            v1.8h,   v1.8b
+        smlal           v4.4s,   v0.4h,   v1.4h
+        smlal2          v5.4s,   v0.8h,   v1.8h
+.else
+        smlal           v4.4s,   v0.4h,   v30.4h
+        smlal2          v5.4s,   v0.8h,   v30.8h
+.endif
+.endif
+.if \uv_layout && \elems == 8
+        b               sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 7
+        b               sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 1
+        b               sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+.if \elems > 4
+.ifc \edge, left
+        increment_seed  4
+        read_rand       x12, 11,  3
+        read_rand       x13, 11,  2
+        read_rand       x14, 11,  1
+        add             x12, x3,  x12, lsl #1
+        add             x13, x3,  x13, lsl #1
+        add             x14, x3,  x14, lsl #1
+        ld1             {v0.h}[5], [x12]
+        ld1             {v0.h}[6], [x13]
+        ld1             {v0.h}[7], [x14]
+        lsl             x2,  x2,  #1             // shift back the state as if we'd done increment_seed with shift=0
+        srshl           v0.8h,   v0.8h,   v31.8h
+        ext             v4.16b,  v4.16b,  v4.16b,  #12
+.ifc \lag, lag3
+        smov            w17, v0.h[5]
+.endif
+.ifnc \lag, lag1
+        smov            w16, v0.h[6]
+.endif
+        smov            w14, v0.h[7]
+
+        mov             v1.16b,  v4.16b
+        mov             w15, #1
+        bl              output_\lag\()_neon
+.else
+        increment_seed  4, shift=0
+        mov             v1.16b,  v4.16b
+        mov             w15, #4
+        bl              output_\lag\()_neon
+.endif
+
+        increment_seed  4, shift=0
+        mov             v1.16b,  v5.16b
+.ifc \edge, right
+        mov             w15, #3
+        bl              output_\lag\()_neon
+        read_shift_rand x15, 11
+        add             x15, x3,  x15, lsl #1
+        ld1             {v1.h}[0], [x15]
+        srshl           v1.4h,   v1.4h,   v31.4h
+        ext             v0.16b,  v0.16b,  v1.16b,  #2
+.else
+        mov             w15, #4
+        bl              output_\lag\()_neon
+.endif
+.else
+        // elems == 1
+        increment_seed  4, shift=0
+        mov             v1.16b,  v4.16b
+        mov             w15, #1
+        bl              output_\lag\()_neon
+        lsr             w2,  w2,  #3
+
+        read_rand       x12, 11,  2
+        read_rand       x13, 11,  1
+        read_rand       x14, 11,  0
+        add             x12, x3,  x12, lsl #1
+        add             x13, x3,  x13, lsl #1
+        add             x14, x3,  x14, lsl #1
+        ld1             {v1.h}[0], [x12]
+        ld1             {v1.h}[1], [x13]
+        ld1             {v1.h}[2], [x14]
+        srshl           v1.4h,   v1.4h,   v31.4h
+        ext             v0.16b,  v0.16b,  v1.16b,  #14
+.endif
+        st1             {v0.8h}, [x0], #16
+        ldr             x30,     [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+.endif
+.endm
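+
+// sum_lag_n_body ties a sum_*_above pass to the scalar output_lag loop:
+// for the uv types it first loads the colocated luma grain through x19
+// (averaging 2x2 for 4:2:0 and 2x1 for 4:2:2) and adds its coefficient,
+// then produces up to 8 new grain values with separate seed handling for
+// the left, mid and right edges of a row before storing them.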
+
+.macro sum_lag1_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag1_\edge\()_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+.ifc \edge, left
+        sub             x12, x0,  #1*GRAIN_WIDTH*2
+        ld1             {v17.8h}, [x12] // load the previous block right above
+.endif
+        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems
+endfunc
+.endm
+
+sum_lag1_func y,      0,   left
+sum_lag1_func y,      0,   mid
+sum_lag1_func y,      0,   right, 7
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 7
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 1
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 1
+
+
+function sum_lag2_above_neon
+        sub             x12, x0,  #2*GRAIN_WIDTH*2 - 16
+        sub             x13, x0,  #1*GRAIN_WIDTH*2 - 16
+        ld1             {v18.8h}, [x12] // load top right
+        ld1             {v21.8h}, [x13]
+
+        dup             v26.8b,  v30.b[0]
+        ext             v22.16b, v16.16b, v17.16b, #12 // top left, top mid
+        dup             v27.8b,  v30.b[1]
+        ext             v23.16b, v16.16b, v17.16b, #14
+        sxtl            v26.8h,  v26.8b
+        dup             v28.8b,  v30.b[3]
+        ext             v0.16b,  v17.16b, v18.16b, #2  // top mid, top right
+        sxtl            v27.8h,  v27.8b
+        dup             v29.8b,  v30.b[4]
+        ext             v1.16b,  v17.16b, v18.16b, #4
+        sxtl            v28.8h,  v28.8b
+        sxtl            v29.8h,  v29.8b
+
+        smull           v4.4s,   v22.4h,  v26.4h
+        smlal           v4.4s,   v23.4h,  v27.4h
+        smlal           v4.4s,   v0.4h,   v28.4h
+        smlal           v4.4s,   v1.4h,   v29.4h
+        smull2          v5.4s,   v22.8h,  v26.8h
+        smlal2          v5.4s,   v23.8h,  v27.8h
+        smlal2          v5.4s,   v0.8h,   v28.8h
+        smlal2          v5.4s,   v1.8h,   v29.8h
+
+        dup             v26.16b, v30.b[5]
+        ext             v22.16b, v19.16b, v20.16b, #12 // top left, top mid
+        dup             v27.16b, v30.b[6]
+        ext             v23.16b, v19.16b, v20.16b, #14
+        sxtl            v26.8h,  v26.8b
+        dup             v28.16b, v30.b[8]
+        ext             v0.16b,  v20.16b, v21.16b, #2  // top mid, top right
+        sxtl            v27.8h,  v27.8b
+        dup             v29.16b, v30.b[9]
+        ext             v1.16b,  v20.16b, v21.16b, #4
+        sxtl            v28.8h,  v28.8b
+        sxtl            v29.8h,  v29.8b
+
+        smlal           v4.4s,   v22.4h,  v26.4h
+        smlal           v4.4s,   v23.4h,  v27.4h
+        smlal           v4.4s,   v0.4h,   v28.4h
+        smlal           v4.4s,   v1.4h,   v29.4h
+        smlal2          v5.4s,   v22.8h,  v26.8h
+        smlal2          v5.4s,   v23.8h,  v27.8h
+        smlal2          v5.4s,   v0.8h,   v28.8h
+        smlal2          v5.4s,   v1.8h,   v29.8h
+
+        dup             v26.16b, v30.b[2]
+        dup             v27.16b, v30.b[7]
+        sxtl            v26.8h,  v26.8b
+        sxtl            v27.8h,  v27.8b
+
+        smlal           v4.4s,   v17.4h,  v26.4h
+        smlal           v4.4s,   v20.4h,  v27.4h
+        smlal2          v5.4s,   v17.8h,  v26.8h
+        smlal2          v5.4s,   v20.8h,  v27.8h
+        mov             v16.16b, v17.16b
+        mov             v17.16b, v18.16b
+
+        mov             v19.16b, v20.16b
+        mov             v20.16b, v21.16b
+        ret
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag2_\edge\()_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+.ifc \edge, left
+        sub             x12, x0,  #2*GRAIN_WIDTH*2
+        sub             x13, x0,  #1*GRAIN_WIDTH*2
+        ld1             {v17.8h}, [x12] // load the previous block right above
+        ld1             {v20.8h}, [x13]
+.endif
+        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, v30.b[12]
+endfunc
+.endm
+
+sum_lag2_func y,      0,   left
+sum_lag2_func y,      0,   mid
+sum_lag2_func y,      0,   right, 7
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 7
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 1
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 1
+
+
+function sum_lag3_above_neon
+        sub             x11, x0,  #3*GRAIN_WIDTH*2 - 16
+        sub             x12, x0,  #2*GRAIN_WIDTH*2 - 16
+        sub             x13, x0,  #1*GRAIN_WIDTH*2 - 16
+        ld1             {v15.8h}, [x11] // load top right
+        ld1             {v18.8h}, [x12]
+        ld1             {v21.8h}, [x13]
+
+        dup             v22.8b,  v29.b[0]
+        ext             v8.16b,  v13.16b, v14.16b, #10 // top left, top mid
+        dup             v23.8b,  v29.b[1]
+        ext             v9.16b,  v13.16b, v14.16b, #12
+        sxtl            v22.8h,  v22.8b
+        dup             v24.8b,  v29.b[2]
+        sxtl            v23.8h,  v23.8b
+        dup             v25.8b,  v29.b[3]
+        ext             v10.16b, v13.16b, v14.16b, #14
+        sxtl            v24.8h,  v24.8b
+        dup             v26.8b,  v29.b[4]
+        ext             v11.16b, v14.16b, v15.16b, #2  // top mid, top right
+        sxtl            v25.8h,  v25.8b
+        dup             v27.8b,  v29.b[5]
+        ext             v12.16b, v14.16b, v15.16b, #4
+        sxtl            v26.8h,  v26.8b
+        dup             v28.8b,  v29.b[6]
+        ext             v13.16b, v14.16b, v15.16b, #6
+        sxtl            v27.8h,  v27.8b
+        sxtl            v28.8h,  v28.8b
+
+        smull           v4.4s,   v8.4h,   v22.4h
+        smlal           v4.4s,   v9.4h,   v23.4h
+        smlal           v4.4s,   v10.4h,  v24.4h
+        smlal           v4.4s,   v11.4h,  v26.4h
+        smlal           v4.4s,   v12.4h,  v27.4h
+        smlal           v4.4s,   v13.4h,  v28.4h
+        smlal           v4.4s,   v14.4h,  v25.4h
+        smull2          v5.4s,   v8.8h,   v22.8h
+        smlal2          v5.4s,   v9.8h,   v23.8h
+        smlal2          v5.4s,   v10.8h,  v24.8h
+        smlal2          v5.4s,   v11.8h,  v26.8h
+        smlal2          v5.4s,   v12.8h,  v27.8h
+        smlal2          v5.4s,   v13.8h,  v28.8h
+        smlal2          v5.4s,   v14.8h,  v25.8h
+
+        dup             v22.8b,  v29.b[7]
+        ext             v8.16b,  v16.16b, v17.16b, #10 // top left, top mid
+        dup             v23.8b,  v29.b[8]
+        ext             v9.16b,  v16.16b, v17.16b, #12
+        sxtl            v22.8h,  v22.8b
+        dup             v24.8b,  v29.b[9]
+        sxtl            v23.8h,  v23.8b
+        dup             v25.8b,  v29.b[10]
+        ext             v10.16b, v16.16b, v17.16b, #14
+        sxtl            v24.8h,  v24.8b
+        dup             v26.8b,  v29.b[11]
+        ext             v11.16b, v17.16b, v18.16b, #2  // top mid, top right
+        sxtl            v25.8h,  v25.8b
+        dup             v27.8b,  v29.b[12]
+        ext             v12.16b, v17.16b, v18.16b, #4
+        sxtl            v26.8h,  v26.8b
+        dup             v28.8b,  v29.b[13]
+        ext             v13.16b, v17.16b, v18.16b, #6
+        sxtl            v27.8h,  v27.8b
+        sxtl            v28.8h,  v28.8b
+
+        smlal           v4.4s,   v8.4h,   v22.4h
+        smlal           v4.4s,   v9.4h,   v23.4h
+        smlal           v4.4s,   v10.4h,  v24.4h
+        smlal           v4.4s,   v11.4h,  v26.4h
+        smlal           v4.4s,   v12.4h,  v27.4h
+        smlal           v4.4s,   v13.4h,  v28.4h
+        smlal           v4.4s,   v17.4h,  v25.4h
+        smlal2          v5.4s,   v8.8h,   v22.8h
+        smlal2          v5.4s,   v9.8h,   v23.8h
+        smlal2          v5.4s,   v10.8h,  v24.8h
+        smlal2          v5.4s,   v11.8h,  v26.8h
+        smlal2          v5.4s,   v12.8h,  v27.8h
+        smlal2          v5.4s,   v13.8h,  v28.8h
+        smlal2          v5.4s,   v17.8h,  v25.8h
+
+        dup             v22.8b,  v29.b[14]
+        ext             v8.16b,  v19.16b, v20.16b, #10 // top left, top mid
+        dup             v23.8b,  v29.b[15]
+        ext             v9.16b,  v19.16b, v20.16b, #12
+        sxtl            v22.8h,  v22.8b
+        dup             v24.8b,  v30.b[0]
+        sxtl            v23.8h,  v23.8b
+        dup             v25.8b,  v30.b[1]
+        ext             v10.16b, v19.16b, v20.16b, #14
+        sxtl            v24.8h,  v24.8b
+        dup             v26.8b,  v30.b[2]
+        ext             v11.16b, v20.16b, v21.16b, #2  // top mid, top right
+        sxtl            v25.8h,  v25.8b
+        dup             v27.8b,  v30.b[3]
+        ext             v12.16b, v20.16b, v21.16b, #4
+        sxtl            v26.8h,  v26.8b
+        dup             v28.8b,  v30.b[4]
+        ext             v13.16b, v20.16b, v21.16b, #6
+        sxtl            v27.8h,  v27.8b
+        sxtl            v28.8h,  v28.8b
+
+        smlal           v4.4s,   v8.4h,   v22.4h
+        smlal           v4.4s,   v9.4h,   v23.4h
+        smlal           v4.4s,   v10.4h,  v24.4h
+        smlal           v4.4s,   v11.4h,  v26.4h
+        smlal           v4.4s,   v12.4h,  v27.4h
+        smlal           v4.4s,   v13.4h,  v28.4h
+        smlal           v4.4s,   v20.4h,  v25.4h
+        mov             v16.16b, v17.16b
+        mov             v17.16b, v18.16b
+        smlal2          v5.4s,   v8.8h,   v22.8h
+        smlal2          v5.4s,   v9.8h,   v23.8h
+        smlal2          v5.4s,   v10.8h,  v24.8h
+        smlal2          v5.4s,   v11.8h,  v26.8h
+        smlal2          v5.4s,   v12.8h,  v27.8h
+        smlal2          v5.4s,   v13.8h,  v28.8h
+        smlal2          v5.4s,   v20.8h,  v25.8h
+
+        mov             v13.16b, v14.16b
+        mov             v14.16b, v15.16b
+
+        mov             v19.16b, v20.16b
+        mov             v20.16b, v21.16b
+        ret
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag3_\edge\()_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+.ifc \edge, left
+        sub             x11, x0,  #3*GRAIN_WIDTH*2
+        sub             x12, x0,  #2*GRAIN_WIDTH*2
+        sub             x13, x0,  #1*GRAIN_WIDTH*2
+        ld1             {v14.8h}, [x11] // load the previous block right above
+        ld1             {v17.8h}, [x12]
+        ld1             {v20.8h}, [x13]
+.endif
+        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, v30.b[8]
+endfunc
+.endm
+
+sum_lag3_func y,      0,   left
+sum_lag3_func y,      0,   mid
+sum_lag3_func y,      0,   right, 7
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 7
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 1
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 1
+
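+// Fill w1 rows of GRAIN_WIDTH (80 + 2) grain values with plain scaled
+// Gaussian noise; used for rows generated without AR prediction (the
+// first three rows, and every row for lag 0 luma).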
+function generate_grain_rows_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+1:
+        mov             w16, #80
+2:
+        bl              get_gaussian_neon
+        srshl           v0.8h,   v0.8h,   v31.8h
+        subs            w16, w16, #8
+        st1             {v0.8h}, [x0], #16
+        b.gt            2b
+        get_grain_2     v0
+        subs            w1,  w1,  #1
+        st1             {v0.s}[0], [x0], #4
+        b.gt            1b
+        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+endfunc
+
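+// 44-wide variant of the above: 40 + 4 values per row, stored with the
+// full GRAIN_WIDTH*2 byte row stride.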
+function generate_grain_rows_44_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+1:
+        mov             w16, #40
+2:
+        bl              get_gaussian_neon
+        srshl           v0.8h,   v0.8h,   v31.8h
+        subs            w16, w16, #8
+        st1             {v0.8h}, [x0], #16
+        b.gt            2b
+        get_grain_4     v0
+        subs            w1,  w1,  #1
+        st1             {v0.4h}, [x0]
+        add             x0,  x0,  #GRAIN_WIDTH*2-80
+        b.gt            1b
+        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+endfunc
+
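+// The gen_grain_uv_*_lag0 helpers below load (and, for 4:2:0/4:2:2,
+// average) the colocated luma grain into v4, scale it by ar_coeffs_uv[0]
+// with a rounded ar_coeff_shift, add fresh Gaussian noise and clamp to
+// the grain range; v1 masks off the lanes at the row edges that are not
+// AR-filtered.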
+function gen_grain_uv_444_lag0_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+        ld1             {v4.8h}, [x19], #16
+gen_grain_uv_lag0_8_start:
+        bl              get_gaussian_neon
+        srshl           v0.8h,   v0.8h,   v31.8h
+gen_grain_uv_lag0_8_add:
+        and             v4.16b,  v4.16b,  v1.16b
+        smull           v2.4s,   v4.4h,   v27.4h
+        smull2          v3.4s,   v4.8h,   v27.8h
+        srshl           v2.4s,   v2.4s,   v28.4s
+        srshl           v3.4s,   v3.4s,   v28.4s
+        sqxtn           v2.4h,   v2.4s
+        sqxtn2          v2.8h,   v3.4s
+        sqadd           v2.8h,   v2.8h,   v0.8h
+        smin            v2.8h,   v2.8h,   v25.8h
+        smax            v2.8h,   v2.8h,   v26.8h
+        st1             {v2.8h}, [x0], #16
+        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+endfunc
+
+function gen_grain_uv_420_lag0_8_neon
+        AARCH64_SIGN_LINK_REGISTER
+        add             x12, x19, #GRAIN_WIDTH*2
+        str             x30, [sp, #-16]!
+        ld1             {v16.8h, v17.8h}, [x19], #32
+        ld1             {v18.8h, v19.8h}, [x12]
+        addp            v16.8h,  v16.8h,  v17.8h
+        addp            v17.8h,  v18.8h,  v19.8h
+        add             v16.8h,  v16.8h,  v17.8h
+        srshr           v4.8h,   v16.8h,  #2
+        b               gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_422_lag0_8_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+        ld1             {v16.8h, v17.8h}, [x19], #32
+        addp            v16.8h,  v16.8h,  v17.8h
+        srshr           v4.8h,   v16.8h,  #1
+        b               gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_420_lag0_4_neon
+        add             x12, x19, #GRAIN_WIDTH*2
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+        ld1             {v16.4h, v17.4h}, [x19]
+        ld1             {v18.4h, v19.4h}, [x12]
+        add             x19,  x19,  #32
+        addp            v16.4h,  v16.4h,  v17.4h
+        addp            v17.4h,  v18.4h,  v19.4h
+        add             v16.4h,  v16.4h,  v17.4h
+        srshr           v4.4h,   v16.4h,  #2
+        get_grain_4     v0
+        b               gen_grain_uv_lag0_8_add
+endfunc
+
+function gen_grain_uv_422_lag0_4_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+        ld1             {v16.4h, v17.4h}, [x19]
+        add             x19,  x19,  #32
+        addp            v16.4h,  v16.4h,  v17.4h
+        srshr           v4.4h,   v16.4h,  #1
+        get_grain_4     v0
+        b               gen_grain_uv_lag0_8_add
+endfunc
+
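+// generate_grain_{y,uv_444}_16bpc_neon: build the full 82x73 grain LUT.
+// Film grain parameters are read from Dav1dFilmGrainData via the FGD_*
+// offsets from asm-offsets.h, and the code dispatches on ar_coeff_lag
+// through the jump table at the end of the function.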
+.macro gen_grain_82 type
+function generate_grain_\type\()_16bpc_neon, export=1
+        AARCH64_SIGN_LINK_REGISTER
+        stp             x30, x19, [sp, #-96]!
+
+.ifc \type, uv_444
+        mov             w13, w3
+        mov             w14, #28
+        add             x19, x1,  #3*GRAIN_WIDTH*2
+        mov             x1,  x2
+        mul             w13, w13, w14
+        clz             w15, w4
+.else
+        clz             w15, w2
+.endif
+        movrel          x3,  X(gaussian_sequence)
+        sub             w15, w15, #24 // -bitdepth_min_8
+        ldr             w2,  [x1, #FGD_SEED]
+        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+        add             x4,  x1,  #FGD_AR_COEFFS_Y
+.else
+        add             x4,  x1,  #FGD_AR_COEFFS_UV
+.endif
+        add             w9,  w9,  w15 // grain_scale_shift - bitdepth_min_8
+        adr             x16, L(gen_grain_\type\()_tbl)
+        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
+        add             w9,  w9,  #4
+        ldrh            w17, [x16, w17, uxtw #1]
+        dup             v31.8h,  w9    // 4 - bitdepth_min_8 + data->grain_scale_shift
+        sub             x16, x16, w17, uxtw
+        neg             v31.8h,  v31.8h
+
+.ifc \type, uv_444
+        cmp             w13, #0
+        mov             w11, #0x49d8
+        mov             w14, #0xb524
+        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
+        csel            w11, w11, w14, ne
+.endif
+
+        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
+        neg             w15, w15            // bitdepth_min_8
+        mov             w8,  #1
+        mov             w10, #1
+        lsl             w8,  w8,  w7        // 1 << ar_coeff_shift
+        lsl             w10, w10, w9        // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift)
+        lsr             w8,  w8,  #1        // 1 << (ar_coeff_shift - 1)
+        lsr             w10, w10, #1        // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift - 1)
+        mov             w5,  #128
+        lsl             w5,  w5,  w15       //   128 << bitdepth_min_8
+        neg             w6,  w5             // -(128 << bitdepth_min_8)
+        sub             w5,  w5,  #1        //  (128 << bitdepth_min_8) - 1
+
+.ifc \type, uv_444
+        eor             w2,  w2,  w11
+.endif
+
+        br              x16
+
+L(generate_grain_\type\()_lag0):
+        AARCH64_VALID_JUMP_TARGET
+.ifc \type, y
+        mov             w1,  #GRAIN_HEIGHT
+        bl              generate_grain_rows_neon
+.else
+        dup             v28.4s,  w7
+        ld1r            {v27.8b}, [x4]      // ar_coeffs_uv[0]
+        movi            v0.16b,  #0
+        movi            v1.16b,  #255
+        dup             v25.8h,  w5
+        dup             v26.8h,  w6
+        ext             v29.16b, v0.16b,  v1.16b,  #10
+        ext             v30.16b, v1.16b,  v0.16b,  #2
+        neg             v28.4s,  v28.4s
+        sxtl            v27.8h,  v27.8b
+
+        mov             w1,  #3
+        bl              generate_grain_rows_neon
+        mov             w1,  #GRAIN_HEIGHT-3
+1:
+        mov             v1.16b,  v29.16b
+        bl              gen_grain_uv_444_lag0_neon // 8
+        movi            v1.16b,  #255
+        bl              gen_grain_uv_444_lag0_neon // 16
+        bl              gen_grain_uv_444_lag0_neon // 24
+        bl              gen_grain_uv_444_lag0_neon // 32
+        bl              gen_grain_uv_444_lag0_neon // 40
+        bl              gen_grain_uv_444_lag0_neon // 48
+        bl              gen_grain_uv_444_lag0_neon // 56
+        bl              gen_grain_uv_444_lag0_neon // 64
+        bl              gen_grain_uv_444_lag0_neon // 72
+        mov             v1.16b,  v30.16b
+        bl              gen_grain_uv_444_lag0_neon // 80
+        get_grain_2     v16
+        subs            w1,  w1,  #1
+        add             x19, x19, #4
+        st1             {v16.s}[0], [x0], #4
+        b.gt            1b
+.endif
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag1):
+        AARCH64_VALID_JUMP_TARGET
+        ld1r            {v27.8b}, [x4], #1  // ar_coeffs_y[0]
+        ld1r            {v28.8b}, [x4], #1  // ar_coeffs_y[1]
+        ld1r            {v29.8b}, [x4]      // ar_coeffs_y[2]
+.ifc \type, y
+        ldrsb           w4,  [x4, #1]       // ar_coeffs_y[3]
+.else
+        add             x4,  x4,  #2
+.endif
+
+        mov             w1,  #3
+.ifc \type, uv_444
+        ld1r            {v30.8b}, [x4]      // ar_coeffs_uv[4]
+        ldursb          w4,  [x4, #-1]      // ar_coeffs_uv[3]
+.endif
+        bl              generate_grain_rows_neon
+        sxtl            v27.8h,  v27.8b
+        sxtl            v28.8h,  v28.8b
+        sxtl            v29.8h,  v29.8b
+.ifc \type, uv_444
+        sxtl            v30.8h,  v30.8b
+.endif
+
+        mov             w1,  #GRAIN_HEIGHT - 3
+1:
+        bl              sum_\type\()_lag1_left_neon  // 8
+        bl              sum_\type\()_lag1_mid_neon   // 16
+        bl              sum_\type\()_lag1_mid_neon   // 24
+        bl              sum_\type\()_lag1_mid_neon   // 32
+        bl              sum_\type\()_lag1_mid_neon   // 40
+        bl              sum_\type\()_lag1_mid_neon   // 48
+        bl              sum_\type\()_lag1_mid_neon   // 56
+        bl              sum_\type\()_lag1_mid_neon   // 64
+        bl              sum_\type\()_lag1_mid_neon   // 72
+        bl              sum_\type\()_lag1_right_neon // 80
+        get_grain_2     v16
+        subs            w1,  w1,  #1
+.ifc \type, uv_444
+        add             x19, x19, #4
+.endif
+        st1             {v16.s}[0], [x0], #4
+        b.gt            1b
+
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag2):
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v30.16b}, [x4]     // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+        smov            w4,  v30.b[10]
+        smov            w17, v30.b[11]
+
+        mov             w1,  #3
+        bl              generate_grain_rows_neon
+
+        mov             w1,  #GRAIN_HEIGHT - 3
+1:
+        bl              sum_\type\()_lag2_left_neon  // 8
+        bl              sum_\type\()_lag2_mid_neon   // 16
+        bl              sum_\type\()_lag2_mid_neon   // 24
+        bl              sum_\type\()_lag2_mid_neon   // 32
+        bl              sum_\type\()_lag2_mid_neon   // 40
+        bl              sum_\type\()_lag2_mid_neon   // 48
+        bl              sum_\type\()_lag2_mid_neon   // 56
+        bl              sum_\type\()_lag2_mid_neon   // 64
+        bl              sum_\type\()_lag2_mid_neon   // 72
+        bl              sum_\type\()_lag2_right_neon // 80
+        get_grain_2     v16
+        subs            w1,  w1,  #1
+.ifc \type, uv_444
+        add             x19, x19, #4
+.endif
+        st1             {v16.s}[0], [x0], #4
+        b.gt            1b
+
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag3):
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+        stp             d8,  d9,  [sp, #16]
+        stp             d10, d11, [sp, #32]
+        stp             d12, d13, [sp, #48]
+        stp             d14, d15, [sp, #64]
+        stp             x20, x21, [sp, #80]
+
+        smov            w4,  v30.b[5]
+        smov            w20, v30.b[6]
+        smov            w21, v30.b[7]
+
+        mov             w1,  #3
+        bl              generate_grain_rows_neon
+
+        mov             w1,  #GRAIN_HEIGHT - 3
+1:
+        bl              sum_\type\()_lag3_left_neon  // 8
+        bl              sum_\type\()_lag3_mid_neon   // 16
+        bl              sum_\type\()_lag3_mid_neon   // 24
+        bl              sum_\type\()_lag3_mid_neon   // 32
+        bl              sum_\type\()_lag3_mid_neon   // 40
+        bl              sum_\type\()_lag3_mid_neon   // 48
+        bl              sum_\type\()_lag3_mid_neon   // 56
+        bl              sum_\type\()_lag3_mid_neon   // 64
+        bl              sum_\type\()_lag3_mid_neon   // 72
+        bl              sum_\type\()_lag3_right_neon // 80
+        get_grain_2     v16
+        subs            w1,  w1,  #1
+.ifc \type, uv_444
+        add             x19, x19, #4
+.endif
+        st1             {v16.s}[0], [x0], #4
+        b.gt            1b
+
+        ldp             x20, x21, [sp, #80]
+        ldp             d14, d15, [sp, #64]
+        ldp             d12, d13, [sp, #48]
+        ldp             d10, d11, [sp, #32]
+        ldp             d8,  d9,  [sp, #16]
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(gen_grain_\type\()_tbl):
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+.macro set_height dst, type
+.ifc \type, uv_420
+        mov             \dst,  #SUB_GRAIN_HEIGHT-3
+.else
+        mov             \dst,  #GRAIN_HEIGHT-3
+.endif
+.endm
+
+.macro increment_y_ptr reg, type
+.ifc \type, uv_420
+        add             \reg, \reg, #2*GRAIN_WIDTH*2-(6*32)
+.else
+        sub             \reg, \reg, #6*32-GRAIN_WIDTH*2
+.endif
+.endm
+
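+// generate_grain_{uv_420,uv_422}_16bpc_neon: build the 44-wide chroma
+// grain LUT (SUB_GRAIN_HEIGHT rows for 4:2:0, GRAIN_HEIGHT rows for
+// 4:2:2), with x19 walking the luma grain for the colocated-luma term of
+// the chroma AR filter.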
+.macro gen_grain_44 type
+function generate_grain_\type\()_16bpc_neon, export=1
+        AARCH64_SIGN_LINK_REGISTER
+        stp             x30, x19, [sp, #-96]!
+
+        mov             w13, w3
+        mov             w14, #28
+        add             x19, x1,  #(3*GRAIN_WIDTH-3)*2
+        mov             x1,  x2
+        mul             w13, w13, w14
+        clz             w15, w4
+
+        movrel          x3,  X(gaussian_sequence)
+        sub             w15, w15, #24 // -bitdepth_min_8
+        ldr             w2,  [x1, #FGD_SEED]
+        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
+        add             x4,  x1,  #FGD_AR_COEFFS_UV
+        add             w9,  w9,  w15 // grain_scale_shift - bitdepth_min_8
+        adr             x16, L(gen_grain_\type\()_tbl)
+        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
+        add             w9,  w9,  #4
+        ldrh            w17, [x16, w17, uxtw #1]
+        dup             v31.8h,  w9    // 4 - bitdepth_min_8 + data->grain_scale_shift
+        sub             x16, x16, w17, uxtw
+        neg             v31.8h,  v31.8h
+
+        cmp             w13, #0
+        mov             w11, #0x49d8
+        mov             w14, #0xb524
+        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
+        csel            w11, w11, w14, ne
+
+        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
+        neg             w15, w15            // bitdepth_min_8
+        mov             w8,  #1
+        mov             w10, #1
+        lsl             w8,  w8,  w7        // 1 << ar_coeff_shift
+        lsl             w10, w10, w9        // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift)
+        lsr             w8,  w8,  #1        // 1 << (ar_coeff_shift - 1)
+        lsr             w10, w10, #1        // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift - 1)
+        mov             w5,  #128
+        lsl             w5,  w5,  w15       //   128 << bitdepth_min_8
+        neg             w6,  w5             // -(128 << bitdepth_min_8)
+        sub             w5,  w5,  #1        //  (128 << bitdepth_min_8) - 1
+
+        eor             w2,  w2,  w11
+
+        br              x16
+
+L(generate_grain_\type\()_lag0):
+        AARCH64_VALID_JUMP_TARGET
+        dup             v28.4s,  w7
+        ld1r            {v27.8b}, [x4]      // ar_coeffs_uv[0]
+        movi            v0.16b,  #0
+        movi            v1.16b,  #255
+        dup             v25.8h,  w5
+        dup             v26.8h,  w6
+        ext             v29.16b, v0.16b,  v1.16b,  #10
+        ext             v30.16b, v1.16b,  v0.16b,  #14
+        neg             v28.4s,  v28.4s
+        sxtl            v27.8h,  v27.8b
+
+        mov             w1,  #3
+        bl              generate_grain_rows_44_neon
+        set_height      w1,  \type
+1:
+        mov             v1.16b,  v29.16b
+        bl              gen_grain_\type\()_lag0_8_neon // 8
+        movi            v1.16b,  #255
+        bl              gen_grain_\type\()_lag0_8_neon // 16
+        bl              gen_grain_\type\()_lag0_8_neon // 24
+        bl              gen_grain_\type\()_lag0_8_neon // 32
+        bl              gen_grain_\type\()_lag0_8_neon // 40
+        mov             v1.16b,  v30.16b
+        bl              gen_grain_\type\()_lag0_4_neon // 44
+        subs            w1,  w1,  #1
+        increment_y_ptr x19, \type
+        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
+        b.gt            1b
+
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag1):
+        AARCH64_VALID_JUMP_TARGET
+        ld1r            {v27.8b}, [x4], #1  // ar_coeffs_uv[0]
+        ld1r            {v28.8b}, [x4], #1  // ar_coeffs_uv[1]
+        ld1r            {v29.8b}, [x4]      // ar_coeffs_uv[2]
+        add             x4,  x4,  #2
+
+        mov             w1,  #3
+        ld1r            {v30.8b}, [x4]      // ar_coeffs_uv[4]
+        ldursb          w4,  [x4, #-1]      // ar_coeffs_uv[3]
+        bl              generate_grain_rows_44_neon
+
+        sxtl            v27.8h,  v27.8b
+        sxtl            v28.8h,  v28.8b
+        sxtl            v29.8h,  v29.8b
+        sxtl            v30.8h,  v30.8b
+        set_height      w1,  \type
+1:
+        bl              sum_\type\()_lag1_left_neon  // 8
+        bl              sum_\type\()_lag1_mid_neon   // 16
+        bl              sum_\type\()_lag1_mid_neon   // 24
+        bl              sum_\type\()_lag1_mid_neon   // 32
+        bl              sum_\type\()_lag1_mid_neon   // 40
+        bl              sum_\type\()_lag1_right_neon // 44
+        subs            w1,  w1,  #1
+        increment_y_ptr x19, \type
+        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
+        b.gt            1b
+
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag2):
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v30.16b}, [x4]     // ar_coeffs_uv[0-12]
+
+        smov            w4,  v30.b[10]
+        smov            w17, v30.b[11]
+
+        mov             w1,  #3
+        bl              generate_grain_rows_44_neon
+
+        set_height      w1,  \type
+1:
+        bl              sum_\type\()_lag2_left_neon  // 8
+        bl              sum_\type\()_lag2_mid_neon   // 16
+        bl              sum_\type\()_lag2_mid_neon   // 24
+        bl              sum_\type\()_lag2_mid_neon   // 32
+        bl              sum_\type\()_lag2_mid_neon   // 40
+        bl              sum_\type\()_lag2_right_neon // 44
+        subs            w1,  w1,  #1
+        increment_y_ptr x19, \type
+        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
+        b.gt            1b
+
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag3):
+        AARCH64_VALID_JUMP_TARGET
+        ldr             q29,      [x4]      // ar_coeffs_uv[0-15]
+        ldr             q30,      [x4, #16] // ar_coeffs_uv[16-24]
+        stp             d8,  d9,  [sp, #16]
+        stp             d10, d11, [sp, #32]
+        stp             d12, d13, [sp, #48]
+        stp             d14, d15, [sp, #64]
+        stp             x20, x21, [sp, #80]
+
+        smov            w4,  v30.b[5]
+        smov            w20, v30.b[6]
+        smov            w21, v30.b[7]
+
+        mov             w1,  #3
+        bl              generate_grain_rows_44_neon
+
+        set_height      w1,  \type
+1:
+        bl              sum_\type\()_lag3_left_neon  // 8
+        bl              sum_\type\()_lag3_mid_neon   // 16
+        bl              sum_\type\()_lag3_mid_neon   // 24
+        bl              sum_\type\()_lag3_mid_neon   // 32
+        bl              sum_\type\()_lag3_mid_neon   // 40
+        bl              sum_\type\()_lag3_right_neon // 44
+        subs            w1,  w1,  #1
+        increment_y_ptr x19, \type
+        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
+        b.gt            1b
+
+        ldp             x20, x21, [sp, #80]
+        ldp             d14, d15, [sp, #64]
+        ldp             d12, d13, [sp, #48]
+        ldp             d10, d11, [sp, #32]
+        ldp             d8,  d9,  [sp, #16]
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(gen_grain_\type\()_tbl):
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
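+// gather_interleaved/gather look each pixel value up as a byte in the
+// scaling LUT at x3, alternating between the two source vectors so that
+// the scalar loads can overlap.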
+.macro gather_interleaved dst1, dst2, src1, src2, off
+        umov            w14, \src1[0]
+        umov            w15, \src2[1]
+        umov            w16, \src1[2]
+        add             x14, x14, x3
+        umov            w17, \src2[3]
+        add             x15, x15, x3
+        ld1             {\dst1}[0+\off], [x14]
+        umov            w14, \src1[4]
+        add             x16, x16, x3
+        ld1             {\dst2}[1+\off], [x15]
+        umov            w15, \src2[5]
+        add             x17, x17, x3
+        ld1             {\dst1}[2+\off], [x16]
+        umov            w16, \src1[6]
+        add             x14, x14, x3
+        ld1             {\dst2}[3+\off], [x17]
+        umov            w17, \src2[7]
+        add             x15, x15, x3
+        ld1             {\dst1}[4+\off], [x14]
+        add             x16, x16, x3
+        ld1             {\dst2}[5+\off], [x15]
+        add             x17, x17, x3
+        ld1             {\dst1}[6+\off], [x16]
+        ld1             {\dst2}[7+\off], [x17]
+.endm
+
+.macro gather dst1, dst2, src1, src2, src3, src4
+        gather_interleaved \dst1, \dst2, \src1, \src3, 0
+        gather_interleaved \dst2, \dst1, \src3, \src1, 0
+        gather_interleaved \dst1, \dst2, \src2, \src4, 8
+        gather_interleaved \dst2, \dst1, \src4, \src2, 8
+.endm
+
+function gather32_neon
+        gather          v6.b, v7.b, v0.h, v1.h, v2.h, v3.h
+        ret
+endfunc
+
+function gather16_neon
+        gather_interleaved v6.b, v7.b, v0.h, v1.h, 0
+        gather_interleaved v7.b, v6.b, v1.h, v0.h, 0
+        ins             v6.d[1], v7.d[0]
+        ret
+endfunc
+
+const overlap_coeffs_0, align=4
+        .short 27, 17, 0,  0
+        .short 17, 27, 32, 32
+endconst
+
+const overlap_coeffs_1, align=4
+        .short 23, 0,  0,  0
+        .short 22, 32, 32, 32
+endconst
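+
+// Grain overlap blend weights, applied as old*c_old + new*c_new with a
+// rounding shift by 5: the first .short row holds the "old" weights and
+// the second the "new" ones. 27/17 is the two-sample overlap, 23/22 the
+// one-sample (subsampled) overlap, and 32 passes unblended grain through.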
+
+.macro calc_offset offx, offy, src, sx, sy
+        and             \offy, \src,  #0xF     // randval & 0xF
+        lsr             \offx, \src,  #4       // randval >> 4
+.if \sy == 0
+        add             \offy, \offy, \offy    // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+        add             \offx, \offx, \offx    // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride
+        madd            \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+        add             \dst, \dst, \offx, uxtw #1 // grain_lut += offx
+.endm
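+
+// calc_offset/add_offset turn one random value into the (offx, offy)
+// origin of a block in the grain LUT; the stride is in bytes and offx is
+// doubled to account for the 16-bit grain entries.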
+
+// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
+//                                 const ptrdiff_t stride,
+//                                 const uint8_t scaling[SCALING_SIZE],
+//                                 const int scaling_shift,
+//                                 const entry grain_lut[][GRAIN_WIDTH],
+//                                 const int offsets[][2],
+//                                 const int h, const ptrdiff_t clip,
+//                                 const ptrdiff_t type,
+//                                 const int bitdepth_max);
+function fgy_32x32_16bpc_neon, export=1
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-80]!
+        stp             d8,  d9,  [sp, #16]
+        stp             d10, d11, [sp, #32]
+        stp             d12, d13, [sp, #48]
+        str             d14,      [sp, #64]
+        eor             w4,  w4,  #15          // 15 - scaling_shift
+        ldr             w11, [x6, #8]          // offsets[1][0]
+        ldr             w13, [x6, #4]          // offsets[0][1]
+        ldr             w15, [x6, #12]         // offsets[1][1]
+        ldr             w10, [sp, #96]         // bitdepth_max
+        ldr             w6,  [x6]              // offsets[0][0]
+        dup             v26.8h,  w10           // bitdepth_max
+        clz             w10, w10
+        ldr             w8,  [sp, #80]         // clip
+        sub             w10, w10, #24          // -bitdepth_min_8
+        mov             x9,  #GRAIN_WIDTH*2    // grain_lut stride
+        neg             w10, w10               // bitdepth_min_8
+
+        dup             v29.8h,  w4            // 15 - scaling_shift
+        dup             v27.8h,  w10           // bitdepth_min_8
+
+        movrel          x16, overlap_coeffs_0
+
+        cbz             w8,  1f
+        // clip
+        movi            v30.8h,  #16
+        movi            v31.8h,  #235
+        sshl            v30.8h,  v30.8h,  v27.8h
+        sshl            v31.8h,  v31.8h,  v27.8h
+        b               2f
+1:
+        // no clip
+        movi            v30.8h,  #0
+        mov             v31.16b, v26.16b       // bitdepth_max
+2:
+
+        ushr            v26.8h,  v26.8h,  #1   // grain_max
+        not             v25.16b, v26.16b       // grain_min
+
+        ld1             {v27.4h, v28.4h}, [x16] // overlap_coeffs
+
+        add             x5,  x5,  #18          // grain_lut += 9
+        add             x5,  x5,  x9,  lsl #3  // grain_lut += 8 * grain_stride
+        add             x5,  x5,  x9           // grain_lut += grain_stride
+
+        calc_offset     w11, w12, w11, 0,  0
+        calc_offset     w13, w14, w13, 0,  0
+        calc_offset     w15, w16, w15, 0,  0
+        calc_offset     w6,  w10, w6,  0,  0
+
+        add_offset      x12, w11, x12, x5,  x9
+        add_offset      x14, w13, x14, x5,  x9
+        add_offset      x16, w15, x16, x5,  x9
+        add_offset      x5,  w6,  x10, x5,  x9
+
+        ldr             w11, [sp, #88]         // type
+        adr             x13, L(fgy_loop_tbl)
+
+        add             x4,  x12, #32*2        // grain_lut += BLOCK_SIZE * bx
+        add             x6,  x14, x9,  lsl #5  // grain_lut += grain_stride * BLOCK_SIZE * by
+
+        tst             w11, #1
+        ldrh            w11, [x13, w11, uxtw #1]
+
+        add             x8,  x16, x9,  lsl #5  // grain_lut += grain_stride * BLOCK_SIZE * by
+        add             x8,  x8,  #32*2        // grain_lut += BLOCK_SIZE * bx
+
+        sub             x11, x13, w11, uxtw
+
+        b.eq            1f
+        // y overlap
+        dup             v8.8h,   v27.h[0]
+        dup             v9.8h,   v27.h[1]
+        mov             w10, w7                // backup actual h
+        mov             w7,  #2
+1:
+        br              x11
+endfunc
+
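+// Four loop bodies follow, one per (ox, oy) combination of horizontal and
+// vertical block overlap; fgy_32x32_16bpc_neon branches into the right
+// one through L(fgy_loop_tbl).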
+function fgy_loop_neon
+.macro fgy ox, oy
+L(loop_\ox\oy):
+        AARCH64_VALID_JUMP_TARGET
+1:
+        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x1],  x2 // src
+.if \ox
+        ld1             {v20.4h},                         [x4],  x9 // grain_lut old
+.endif
+.if \oy
+        ld1             {v21.8h, v22.8h, v23.8h, v24.8h}, [x6],  x9 // grain_lut top
+.endif
+.if \ox && \oy
+        ld1             {v14.4h},                         [x8],  x9 // grain_lut top old
+.endif
+        mvni            v4.8h,   #0xf0, lsl #8 // 0x0fff
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5],  x9 // grain_lut
+
+        // Make sure that uninitialized pixels out of range past the right
+        // edge are in range; their actual values shouldn't matter.
+        and             v0.16b,  v0.16b,  v4.16b
+        and             v1.16b,  v1.16b,  v4.16b
+        and             v2.16b,  v2.16b,  v4.16b
+        and             v3.16b,  v3.16b,  v4.16b
+        bl              gather32_neon
+
+.if \ox
+        smull           v20.4s,  v20.4h,  v27.4h
+        smlal           v20.4s,  v16.4h,  v28.4h
+.endif
+
+.if \oy
+.if \ox
+        smull           v14.4s,  v14.4h,  v27.4h
+        smlal           v14.4s,  v21.4h,  v28.4h
+        sqrshrn         v20.4h,  v20.4s,  #5
+        sqrshrn         v14.4h,  v14.4s,  #5
+        smin            v20.4h,  v20.4h,  v26.4h
+        smin            v14.4h,  v14.4h,  v26.4h
+        smax            v20.4h,  v20.4h,  v25.4h
+        smax            v14.4h,  v14.4h,  v25.4h
+.endif
+
+.if \ox
+        smull           v10.4s,  v20.4h,  v9.4h
+.else
+        smull           v10.4s,  v16.4h,  v9.4h
+.endif
+        smull2          v11.4s,  v16.8h,  v9.8h
+        smull           v12.4s,  v17.4h,  v9.4h
+        smull2          v13.4s,  v17.8h,  v9.8h
+        smull           v16.4s,  v18.4h,  v9.4h
+        smull2          v17.4s,  v18.8h,  v9.8h
+        smull           v18.4s,  v19.4h,  v9.4h
+        smull2          v19.4s,  v19.8h,  v9.8h
+.if \ox
+        smlal           v10.4s,  v14.4h,  v8.4h
+.else
+        smlal           v10.4s,  v21.4h,  v8.4h
+.endif
+        smlal2          v11.4s,  v21.8h,  v8.8h
+        smlal           v12.4s,  v22.4h,  v8.4h
+        smlal2          v13.4s,  v22.8h,  v8.8h
+        smlal           v16.4s,  v23.4h,  v8.4h
+        smlal2          v17.4s,  v23.8h,  v8.8h
+        smlal           v18.4s,  v24.4h,  v8.4h
+        smlal2          v19.4s,  v24.8h,  v8.8h
+        sqrshrn         v10.4h,  v10.4s,  #5
+        sqrshrn2        v10.8h,  v11.4s,  #5
+        sqrshrn         v11.4h,  v12.4s,  #5
+        sqrshrn2        v11.8h,  v13.4s,  #5
+        sqrshrn         v12.4h,  v16.4s,  #5
+        sqrshrn2        v12.8h,  v17.4s,  #5
+        sqrshrn         v13.4h,  v18.4s,  #5
+        sqrshrn2        v13.8h,  v19.4s,  #5
+        smin            v16.8h,  v10.8h,  v26.8h
+        smin            v17.8h,  v11.8h,  v26.8h
+        smin            v18.8h,  v12.8h,  v26.8h
+        smin            v19.8h,  v13.8h,  v26.8h
+        smax            v16.8h,  v16.8h,  v25.8h
+        smax            v17.8h,  v17.8h,  v25.8h
+        smax            v18.8h,  v18.8h,  v25.8h
+        smax            v19.8h,  v19.8h,  v25.8h
+.endif
+
+        uxtl            v4.8h,   v6.8b            // scaling
+.if \ox && !\oy
+        sqrshrn         v20.4h,  v20.4s,  #5
+.endif
+        uxtl2           v5.8h,   v6.16b
+.if \ox && !\oy
+        smin            v20.4h,  v20.4h,  v26.4h
+.endif
+        uxtl            v6.8h,   v7.8b
+.if \ox && !\oy
+        smax            v20.4h,  v20.4h,  v25.4h
+.endif
+        uxtl2           v7.8h,   v7.16b
+.if \ox && !\oy
+        ins             v16.d[0], v20.d[0]
+.endif
+        ushl            v4.8h,   v4.8h,   v29.8h  // scaling << (15 - scaling_shift)
+        ushl            v5.8h,   v5.8h,   v29.8h
+        ushl            v6.8h,   v6.8h,   v29.8h
+        ushl            v7.8h,   v7.8h,   v29.8h
+
+        sqrdmulh        v20.8h,  v16.8h,  v4.8h   // round2((scaling << (15 - scaling_shift)) * grain, 15)
+        sqrdmulh        v21.8h,  v17.8h,  v5.8h
+        sqrdmulh        v22.8h,  v18.8h,  v6.8h
+        sqrdmulh        v23.8h,  v19.8h,  v7.8h
+
+        usqadd          v0.8h,   v20.8h           // *src + noise
+        usqadd          v1.8h,   v21.8h
+        usqadd          v2.8h,   v22.8h
+        usqadd          v3.8h,   v23.8h
+
+        umax            v0.8h,   v0.8h,   v30.8h
+        umax            v1.8h,   v1.8h,   v30.8h
+        umax            v2.8h,   v2.8h,   v30.8h
+        umax            v3.8h,   v3.8h,   v30.8h
+        umin            v0.8h,   v0.8h,   v31.8h
+        umin            v1.8h,   v1.8h,   v31.8h
+        umin            v2.8h,   v2.8h,   v31.8h
+        umin            v3.8h,   v3.8h,   v31.8h
+
+        subs            w7,  w7,  #1
+.if \oy
+        dup             v8.8h,   v28.h[0]
+        dup             v9.8h,   v28.h[1]
+.endif
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x2 // dst
+        b.gt            1b
+
+.if \oy
+        cmp             w10, #2
+        sub             w7,  w10, #2           // restore actual remaining h
+        b.gt            L(loop_\ox\()0)
+.endif
+        ldr             d14,      [sp, #64]
+        ldp             d12, d13, [sp, #48]
+        ldp             d10, d11, [sp, #32]
+        ldp             d8,  d9,  [sp, #16]
+        ldr             x30, [sp], #80
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+.endm
+
+        fgy             0, 0
+        fgy             0, 1
+        fgy             1, 0
+        fgy             1, 1
+
+L(fgy_loop_tbl):
+        .hword L(fgy_loop_tbl) - L(loop_00)
+        .hword L(fgy_loop_tbl) - L(loop_01)
+        .hword L(fgy_loop_tbl) - L(loop_10)
+        .hword L(fgy_loop_tbl) - L(loop_11)
+endfunc
+
+// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
+//                                      const pixel *const src,
+//                                      const ptrdiff_t stride,
+//                                      const uint8_t scaling[SCALING_SIZE],
+//                                      const Dav1dFilmGrainData *const data,
+//                                      const entry grain_lut[][GRAIN_WIDTH],
+//                                      const pixel *const luma_row,
+//                                      const ptrdiff_t luma_stride,
+//                                      const int offsets[][2],
+//                                      const ptrdiff_t h, const ptrdiff_t uv,
+//                                      const ptrdiff_t is_id,
+//                                      const ptrdiff_t type,
+//                                      const int bitdepth_max);
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_16bpc_neon, export=1
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30,      [sp, #-80]!
+        stp             d8,  d9,  [sp, #16]
+        stp             d10, d11, [sp, #32]
+        stp             d12, d13, [sp, #48]
+        stp             d14, d15, [sp, #64]
+
+        ldp             x8,  x9,  [sp, #80]    // offsets, h
+        ldp             x10, x11, [sp, #96]    // uv, is_id
+        ldr             w16,      [sp, #120]   // bitdepth_max
+
+        ldr             w13, [x4, #FGD_SCALING_SHIFT]
+        ldr             w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+        dup             v23.8h,  w16           // bitdepth_max
+        clz             w16, w16
+        eor             w13, w13, #15          // 15 - scaling_shift
+        sub             w16, w16, #24          // -bitdepth_min_8
+
+        // !csfl
+        add             x10, x4,  x10, lsl #2  // + 4*uv
+        add             x14, x10, #FGD_UV_LUMA_MULT
+        add             x15, x10, #FGD_UV_MULT
+        add             x10, x10, #FGD_UV_OFFSET
+        neg             w16, w16               // bitdepth_min_8
+        ld1r            {v8.8h},  [x14]        // uv_luma_mult
+        ld1r            {v24.8h}, [x10]        // uv_offset
+        ld1r            {v9.8h},  [x15]        // uv_mult
+
+        dup             v29.8h,  w13           // 15 - scaling_shift
+        dup             v27.8h,  w16           // bitdepth_min_8
+
+        cbz             w12, 1f
+        // clip
+        movi            v30.8h,  #16
+        movi            v31.8h,  #240
+        sshl            v30.8h,  v30.8h,  v27.8h
+        sshl            v31.8h,  v31.8h,  v27.8h
+        cbz             w11, 2f
+        // is_id
+        movi            v31.8h,  #235
+        sshl            v31.8h,  v31.8h,  v27.8h
+        b               2f
+1:
+        // no clip
+        movi            v30.8h,  #0
+        mov             v31.16b, v23.16b       // bitdepth_max
+2:
+
+        ushr            v15.8h,  v23.8h,  #1   // grain_max
+        sshl            v24.8h,  v24.8h,  v27.8h // uv_offset << bitdepth_min_8
+        not             v14.16b, v15.16b       // grain_min
+
+        ldr             w12, [x8, #8]          // offsets[1][0]
+        ldr             w14, [x8, #4]          // offsets[0][1]
+        ldr             w16, [x8, #12]         // offsets[1][1]
+        ldr             w8,  [x8]              // offsets[0][0]
+
+        mov             x10, #GRAIN_WIDTH*2    // grain_lut stride
+
+        add             x5,  x5,  #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
+.if \sy
+        add             x5,  x5,  x10, lsl #2  // grain_lut += 4 * grain_stride
+        add             x5,  x5,  x10, lsl #1  // grain_lut += 2 * grain_stride
+.else
+        add             x5,  x5,  x10, lsl #3  // grain_lut += 8 * grain_stride
+        add             x5,  x5,  x10          // grain_lut += grain_stride
+.endif
+
+        calc_offset     w12, w13, w12, \sx, \sy
+        calc_offset     w14, w15, w14, \sx, \sy
+        calc_offset     w16, w17, w16, \sx, \sy
+        calc_offset     w8,  w11, w8,  \sx, \sy
+
+        add_offset      x13, w12, x13, x5,  x10
+        add_offset      x15, w14, x15, x5,  x10
+        add_offset      x17, w16, x17, x5,  x10
+        add_offset      x5,  w8,  x11, x5,  x10
+
+        add             x4,  x13, #2*(32 >> \sx)      // grain_lut += BLOCK_SIZE * bx
+        add             x8,  x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+        add             x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+        add             x11, x11, #2*(32 >> \sx)      // grain_lut += BLOCK_SIZE * bx
+
+        ldr             w13, [sp, #112]        // type
+
+        movrel          x16, overlap_coeffs_\sx
+        adr             x14, L(fguv_loop_sx\sx\()_tbl)
+
+        ld1             {v27.4h, v28.4h}, [x16] // overlap_coeffs
+        tst             w13, #1
+        ldrh            w13, [x14, w13, uxtw #1]
+
+        b.eq            1f
+        // y overlap
+        sub             w12, w9,  #(2 >> \sy)  // backup remaining h
+        mov             w9,  #(2 >> \sy)
+
+1:
+        sub             x13, x14, w13, uxtw
+
+.if \sy
+        movi            v25.8h,  #23
+        movi            v26.8h,  #22
+.else
+        movi            v25.8h,  #27
+        movi            v26.8h,  #17
+.endif
+
+.if \sy
+        add             x7,  x7,  x7           // luma_stride *= 2
+.endif
+
+        br              x13
+endfunc
+.endm
+
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+function fguv_loop_sx0_neon
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+        AARCH64_VALID_JUMP_TARGET
+1:
+.if \ox
+        ld1             {v4.4h}, [x4],  x10  // grain_lut old
+.endif
+.if \oy
+        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x8],  x10 // grain_lut top
+.endif
+.if \ox && \oy
+        ld1             {v5.4h}, [x11], x10  // grain_lut top old
+.endif
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5],  x10 // grain_lut
+
+.if \ox
+        smull           v4.4s,   v4.4h,   v27.4h
+        smlal           v4.4s,   v16.4h,  v28.4h
+.endif
+
+.if \oy
+.if \ox
+        smull           v5.4s,   v5.4h,   v27.4h
+        smlal           v5.4s,   v0.4h,   v28.4h
+        sqrshrn         v4.4h,   v4.4s,   #5
+        sqrshrn         v5.4h,   v5.4s,   #5
+        smin            v4.4h,   v4.4h,   v15.4h
+        smin            v5.4h,   v5.4h,   v15.4h
+        smax            v4.4h,   v4.4h,   v14.4h
+        smax            v5.4h,   v5.4h,   v14.4h
+        ins             v16.d[0], v4.d[0]
+        ins             v0.d[0],  v5.d[0]
+.endif
+
+        smull           v6.4s,   v16.4h,  v26.4h
+        smull2          v7.4s,   v16.8h,  v26.8h
+        smull           v10.4s,  v17.4h,  v26.4h
+        smull2          v11.4s,  v17.8h,  v26.8h
+        smull           v16.4s,  v18.4h,  v26.4h
+        smull2          v17.4s,  v18.8h,  v26.8h
+        smull           v18.4s,  v19.4h,  v26.4h
+        smull2          v19.4s,  v19.8h,  v26.8h
+        smlal           v6.4s,   v0.4h,   v25.4h
+        smlal2          v7.4s,   v0.8h,   v25.8h
+        smlal           v10.4s,  v1.4h,   v25.4h
+        smlal2          v11.4s,  v1.8h,   v25.8h
+        smlal           v16.4s,  v2.4h,   v25.4h
+        smlal2          v17.4s,  v2.8h,   v25.8h
+        smlal           v18.4s,  v3.4h,   v25.4h
+        smlal2          v19.4s,  v3.8h,   v25.8h
+        sqrshrn         v6.4h,   v6.4s,   #5
+        sqrshrn2        v6.8h,   v7.4s,   #5
+        sqrshrn         v7.4h,   v10.4s,  #5
+        sqrshrn2        v7.8h,   v11.4s,  #5
+        sqrshrn         v10.4h,  v16.4s,  #5
+        sqrshrn2        v10.8h,  v17.4s,  #5
+        sqrshrn         v11.4h,  v18.4s,  #5
+        sqrshrn2        v11.8h,  v19.4s,  #5
+.endif
+
+.if \ox && !\oy
+        sqrshrn         v4.4h,   v4.4s,   #5
+        smin            v4.4h,   v4.4h,   v15.4h
+.endif
+        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x6],  x7 // luma
+.if \oy
+        smin            v16.8h,  v6.8h,   v15.8h
+        smin            v17.8h,  v7.8h,   v15.8h
+        smin            v18.8h,  v10.8h,  v15.8h
+        smin            v19.8h,  v11.8h,  v15.8h
+        smax            v16.8h,  v16.8h,  v14.8h
+        smax            v17.8h,  v17.8h,  v14.8h
+        smax            v18.8h,  v18.8h,  v14.8h
+        smax            v19.8h,  v19.8h,  v14.8h
+.endif
+
+.if \ox && !\oy
+        smax            v4.4h,   v4.4h,   v14.4h
+.endif
+        ld1             {v10.8h, v11.8h, v12.8h, v13.8h}, [x1],  x2 // src
+.if \ox && !\oy
+        ins             v16.d[0], v4.d[0]
+.endif
+
+.if !\csfl
+        smull           v4.4s,   v0.4h,   v8.4h
+        smull2          v5.4s,   v0.8h,   v8.8h
+        smull           v6.4s,   v1.4h,   v8.4h
+        smull2          v7.4s,   v1.8h,   v8.8h
+        smull           v0.4s,   v2.4h,   v8.4h
+        smull2          v1.4s,   v2.8h,   v8.8h
+        smull           v2.4s,   v3.4h,   v8.4h
+        smull2          v3.4s,   v3.8h,   v8.8h
+        smlal           v4.4s,   v10.4h,  v9.4h
+        smlal2          v5.4s,   v10.8h,  v9.8h
+        smlal           v6.4s,   v11.4h,  v9.4h
+        smlal2          v7.4s,   v11.8h,  v9.8h
+        smlal           v0.4s,   v12.4h,  v9.4h
+        smlal2          v1.4s,   v12.8h,  v9.8h
+        smlal           v2.4s,   v13.4h,  v9.4h
+        smlal2          v3.4s,   v13.8h,  v9.8h
+        shrn            v4.4h,   v4.4s,   #6
+        shrn2           v4.8h,   v5.4s,   #6
+        shrn            v5.4h,   v6.4s,   #6
+        shrn2           v5.8h,   v7.4s,   #6
+        shrn            v6.4h,   v0.4s,   #6
+        shrn2           v6.8h,   v1.4s,   #6
+        shrn            v7.4h,   v2.4s,   #6
+        shrn2           v7.8h,   v3.4s,   #6
+        add             v0.8h,   v4.8h,   v24.8h
+        add             v1.8h,   v5.8h,   v24.8h
+        add             v2.8h,   v6.8h,   v24.8h
+        add             v3.8h,   v7.8h,   v24.8h
+        movi            v20.8h,  #0
+        smin            v0.8h,   v0.8h,   v23.8h
+        smin            v1.8h,   v1.8h,   v23.8h
+        smin            v2.8h,   v2.8h,   v23.8h
+        smin            v3.8h,   v3.8h,   v23.8h
+        smax            v0.8h,   v0.8h,   v20.8h
+        smax            v1.8h,   v1.8h,   v20.8h
+        smax            v2.8h,   v2.8h,   v20.8h
+        smax            v3.8h,   v3.8h,   v20.8h
+.else
+        // Make sure that uninitialized pixels out of range past the right
+        // edge are in range; their actual values shouldn't matter.
+        and             v0.16b,  v0.16b,  v23.16b
+        and             v1.16b,  v1.16b,  v23.16b
+        and             v2.16b,  v2.16b,  v23.16b
+        and             v3.16b,  v3.16b,  v23.16b
+.endif
+
+        bl              gather32_neon
+
+        uxtl            v4.8h,   v6.8b            // scaling
+        uxtl2           v5.8h,   v6.16b
+        uxtl            v6.8h,   v7.8b
+        uxtl2           v7.8h,   v7.16b
+
+        ushl            v4.8h,   v4.8h,   v29.8h  // scaling << (15 - scaling_shift)
+        ushl            v5.8h,   v5.8h,   v29.8h
+        ushl            v6.8h,   v6.8h,   v29.8h
+        ushl            v7.8h,   v7.8h,   v29.8h
+
+        sqrdmulh        v16.8h,  v16.8h,  v4.8h   // round2((scaling << (15 - scaling_shift)) * grain, 15)
+        sqrdmulh        v17.8h,  v17.8h,  v5.8h
+        sqrdmulh        v18.8h,  v18.8h,  v6.8h
+        sqrdmulh        v19.8h,  v19.8h,  v7.8h
+
+        usqadd          v10.8h,  v16.8h           // *src + noise
+        usqadd          v11.8h,  v17.8h
+        usqadd          v12.8h,  v18.8h
+        usqadd          v13.8h,  v19.8h
+
+        umax            v0.8h,   v10.8h,  v30.8h
+        umax            v1.8h,   v11.8h,  v30.8h
+        umax            v2.8h,   v12.8h,  v30.8h
+        umax            v3.8h,   v13.8h,  v30.8h
+        umin            v0.8h,   v0.8h,   v31.8h
+        umin            v1.8h,   v1.8h,   v31.8h
+        umin            v2.8h,   v2.8h,   v31.8h
+        umin            v3.8h,   v3.8h,   v31.8h
+
+        subs            w9,  w9,  #1
+.if \oy
+        dup             v25.8h,  v28.h[0]
+        dup             v26.8h,  v28.h[1]
+.endif
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x2 // dst
+        b.gt            1b
+
+.if \oy
+        cmp             w12, #0
+        mov             w9,  w12               // restore actual remaining h
+        b.gt            L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
+.endif
+        b               9f
+.endm
+        fguv_loop_sx0   0, 0, 0
+        fguv_loop_sx0   0, 0, 1
+        fguv_loop_sx0   0, 1, 0
+        fguv_loop_sx0   0, 1, 1
+        fguv_loop_sx0   1, 0, 0
+        fguv_loop_sx0   1, 0, 1
+        fguv_loop_sx0   1, 1, 0
+        fguv_loop_sx0   1, 1, 1
+
+9:
+        ldp             d14, d15, [sp, #64]
+        ldp             d12, d13, [sp, #48]
+        ldp             d10, d11, [sp, #32]
+        ldp             d8,  d9,  [sp, #16]
+        ldr             x30,      [sp], #80
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(fguv_loop_sx0_tbl):
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
+endfunc
+
+function fguv_loop_sx1_neon
+.macro fguv_loop_sx1 csfl, ox, oy
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+        AARCH64_VALID_JUMP_TARGET
+1:
+.if \ox
+        ld1             {v18.4h}, [x4],  x10  // grain_lut old
+.endif
+.if \oy
+        ld1             {v20.8h, v21.8h},  [x8],  x10 // grain_lut top
+.endif
+.if \ox && \oy
+        ld1             {v19.4h}, [x11], x10  // grain_lut top old
+.endif
+        ld1             {v16.8h, v17.8h}, [x5],  x10 // grain_lut
+
+.if \ox
+        smull           v18.4s,  v18.4h,  v27.4h
+        smlal           v18.4s,  v16.4h,  v28.4h
+.endif
+
+.if \oy
+.if \ox
+        smull           v19.4s,  v19.4h,  v27.4h
+        smlal           v19.4s,  v20.4h,  v28.4h
+        sqrshrn         v18.4h,  v18.4s,  #5
+        sqrshrn         v19.4h,  v19.4s,  #5
+        smin            v18.4h,  v18.4h,  v15.4h
+        smin            v19.4h,  v19.4h,  v15.4h
+        smax            v18.4h,  v18.4h,  v14.4h
+        smax            v19.4h,  v19.4h,  v14.4h
+        ins             v16.d[0], v18.d[0]
+        ins             v20.d[0], v19.d[0]
+.endif
+
+        smull           v0.4s,   v16.4h,  v26.4h
+        smull2          v1.4s,   v16.8h,  v26.8h
+        smull           v2.4s,   v17.4h,  v26.4h
+        smull2          v3.4s,   v17.8h,  v26.8h
+        smlal           v0.4s,   v20.4h,  v25.4h
+        smlal2          v1.4s,   v20.8h,  v25.8h
+        smlal           v2.4s,   v21.4h,  v25.4h
+        smlal2          v3.4s,   v21.8h,  v25.8h
+        sqrshrn         v16.4h,  v0.4s,   #5
+        sqrshrn2        v16.8h,  v1.4s,   #5
+        sqrshrn         v17.4h,  v2.4s,   #5
+        sqrshrn2        v17.8h,  v3.4s,   #5
+.endif
+
+.if \ox && !\oy
+        sqrshrn         v18.4h,  v18.4s,  #5
+        smin            v18.4h,  v18.4h,  v15.4h
+.endif
+        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x6],  x7 // luma
+.if \oy
+        smin            v16.8h,  v16.8h,  v15.8h
+        smin            v17.8h,  v17.8h,  v15.8h
+        smax            v16.8h,  v16.8h,  v14.8h
+        smax            v17.8h,  v17.8h,  v14.8h
+.endif
+
+.if \ox && !\oy
+        smax            v18.4h,  v18.4h,  v14.4h
+.endif
+        ld1             {v10.8h, v11.8h},  [x1],  x2 // src
+.if \ox && !\oy
+        ins             v16.d[0], v18.d[0]
+.endif
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v1.8h,   v2.8h,   v3.8h
+        urshr           v0.8h,   v0.8h,   #1
+        urshr           v1.8h,   v1.8h,   #1
+.if !\csfl
+        smull           v2.4s,   v0.4h,   v8.4h
+        smull2          v3.4s,   v0.8h,   v8.8h
+        smull           v0.4s,   v1.4h,   v8.4h
+        smull2          v1.4s,   v1.8h,   v8.8h
+        smlal           v2.4s,   v10.4h,  v9.4h
+        smlal2          v3.4s,   v10.8h,  v9.8h
+        smlal           v0.4s,   v11.4h,  v9.4h
+        smlal2          v1.4s,   v11.8h,  v9.8h
+        shrn            v2.4h,   v2.4s,   #6
+        shrn2           v2.8h,   v3.4s,   #6
+        shrn            v3.4h,   v0.4s,   #6
+        shrn2           v3.8h,   v1.4s,   #6
+        add             v0.8h,   v2.8h,   v24.8h
+        add             v1.8h,   v3.8h,   v24.8h
+        movi            v2.8h,   #0
+        smin            v0.8h,   v0.8h,   v23.8h
+        smin            v1.8h,   v1.8h,   v23.8h
+        smax            v0.8h,   v0.8h,   v2.8h
+        smax            v1.8h,   v1.8h,   v2.8h
+.else
+        // Make sure that uninitialized pixels out of range past the right
+        // edge are in range; their actual values shouldn't matter.
+        and             v0.16b,  v0.16b,  v23.16b
+        and             v1.16b,  v1.16b,  v23.16b
+.endif
+
+        bl              gather16_neon
+
+        uxtl            v4.8h,   v6.8b            // scaling
+        uxtl2           v5.8h,   v6.16b
+
+        ushl            v4.8h,   v4.8h,   v29.8h  // scaling << (15 - scaling_shift)
+        ushl            v5.8h,   v5.8h,   v29.8h
+
+        sqrdmulh        v16.8h,  v16.8h,  v4.8h   // round2((scaling << (15 - scaling_shift)) * grain, 15)
+        sqrdmulh        v17.8h,  v17.8h,  v5.8h
+
+        usqadd          v10.8h,  v16.8h           // *src + noise
+        usqadd          v11.8h,  v17.8h
+
+        umax            v0.8h,   v10.8h,  v30.8h
+        umax            v1.8h,   v11.8h,  v30.8h
+        umin            v0.8h,   v0.8h,   v31.8h
+        umin            v1.8h,   v1.8h,   v31.8h
+
+.if \oy
+        mov             v16.16b, v25.16b
+.endif
+        subs            w9,  w9,  #1
+.if \oy
+        mov             v25.16b, v26.16b
+        mov             v26.16b, v16.16b
+.endif
+        st1             {v0.8h, v1.8h},  [x0], x2 // dst
+        b.gt            1b
+
+.if \oy
+        cmp             w12, #0
+        mov             w9,  w12               // restore actual remaining h
+        b.gt            L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
+.endif
+
+        b               9f
+.endm
+        fguv_loop_sx1   0, 0, 0
+        fguv_loop_sx1   0, 0, 1
+        fguv_loop_sx1   0, 1, 0
+        fguv_loop_sx1   0, 1, 1
+        fguv_loop_sx1   1, 0, 0
+        fguv_loop_sx1   1, 0, 1
+        fguv_loop_sx1   1, 1, 0
+        fguv_loop_sx1   1, 1, 1
+
+9:
+        ldp             d14, d15, [sp, #64]
+        ldp             d12, d13, [sp, #48]
+        ldp             d10, d11, [sp, #32]
+        ldp             d8,  d9,  [sp, #16]
+        ldr             x30,      [sp], #80
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(fguv_loop_sx1_tbl):
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
+endfunc
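
The chroma loops above all push the grain through the same fixed-point step: the scaling value is pre-shifted left by 15 - scaling_shift and then combined with the grain via sqrdmulh, which is what the round2((scaling << (15 - scaling_shift)) * grain, 15) comments refer to. A minimal single-lane C model of that trick, written purely for illustration (the names are mine, not from dav1d):

    #include <stdint.h>

    /* Model of AArch64 sqrdmulh on one lane: (2*a*b + 2^15) >> 16, saturated. */
    static int16_t sqrdmulh_model(int16_t a, int16_t b) {
        int64_t p = 2 * (int64_t)a * b + (1 << 15);
        if (p > INT32_MAX) p = INT32_MAX;   /* only reachable for a == b == -32768 */
        return (int16_t)(p >> 16);
    }

    /* round2(grain * scaling, scaling_shift) in a single multiply:
     * pre-shifting scaling by (15 - scaling_shift) folds the final shift
     * into sqrdmulh's fixed rounding shift by 16. scaling_shift is 8..11,
     * so scaling << (15 - scaling_shift) still fits in int16. */
    static int16_t scale_grain(int16_t grain, uint8_t scaling, int scaling_shift) {
        int16_t s = (int16_t)(scaling << (15 - scaling_shift));
        return sqrdmulh_model(grain, s);
    }
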
diff --git a/third_party/dav1d/src/arm/64/itx.S b/third_party/dav1d/src/arm/64/itx.S
index ec932af0ce66d..c9650e9d544b7 100644
--- a/third_party/dav1d/src/arm/64/itx.S
+++ b/third_party/dav1d/src/arm/64/itx.S
@@ -133,10 +133,10 @@ endconst
 .endif
 .endm
 
-.macro rshrn_sz d0, s0, s1, shift, sz
-        rshrn           \d0\().4h, \s0\().4s, \shift
+.macro sqrshrn_sz d0, s0, s1, shift, sz
+        sqrshrn         \d0\().4h, \s0\().4s, \shift
 .ifc \sz, .8h
-        rshrn2          \d0\().8h, \s1\().4s, \shift
+        sqrshrn2        \d0\().8h, \s1\().4s, \shift
 .endif
 .endm
 
@@ -438,11 +438,11 @@ endfunc
         smull_smlal     v6,  v7,  \r1, \r3, v0.h[3], v0.h[2], \sz
         smull_smlsl     v4,  v5,  \r1, \r3, v0.h[2], v0.h[3], \sz
         smull_smlal     v2,  v3,  \r0, \r2, v0.h[0], v0.h[0], \sz
-        rshrn_sz        v6,  v6,  v7,  #12, \sz
-        rshrn_sz        v7,  v4,  v5,  #12, \sz
+        sqrshrn_sz      v6,  v6,  v7,  #12, \sz
+        sqrshrn_sz      v7,  v4,  v5,  #12, \sz
         smull_smlsl     v4,  v5,  \r0, \r2, v0.h[0], v0.h[0], \sz
-        rshrn_sz        v2,  v2,  v3,  #12, \sz
-        rshrn_sz        v3,  v4,  v5,  #12, \sz
+        sqrshrn_sz      v2,  v2,  v3,  #12, \sz
+        sqrshrn_sz      v3,  v4,  v5,  #12, \sz
         sqadd           \r0\sz,  v2\sz,   v6\sz
         sqsub           \r3\sz,  v2\sz,   v6\sz
         sqadd           \r1\sz,  v3\sz,   v7\sz
@@ -714,11 +714,11 @@ def_fn_4x4 identity, flipadst
         smull_smlsl     v2,  v3,  \r1, \r7, v0.h[4], v0.h[5], \sz // -> t4a
         smull_smlal     v4,  v5,  \r1, \r7, v0.h[5], v0.h[4], \sz // -> t7a
         smull_smlsl     v6,  v7,  \r5, \r3, v0.h[6], v0.h[7], \sz // -> t5a
-        rshrn_sz        \r1, v2,  v3,  #12, \sz                   // t4a
-        rshrn_sz        \r7, v4,  v5,  #12, \sz                   // t7a
+        sqrshrn_sz      \r1, v2,  v3,  #12, \sz                   // t4a
+        sqrshrn_sz      \r7, v4,  v5,  #12, \sz                   // t7a
         smull_smlal     v2,  v3,  \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a
-        rshrn_sz        \r3, v6,  v7,  #12, \sz                   // t5a
-        rshrn_sz        \r5, v2,  v3,  #12, \sz                   // t6a
+        sqrshrn_sz      \r3, v6,  v7,  #12, \sz                   // t5a
+        sqrshrn_sz      \r5, v2,  v3,  #12, \sz                   // t6a
 
         sqadd           v2\sz,   \r1\sz,  \r3\sz // t4
         sqsub           \r1\sz,  \r1\sz,  \r3\sz // t5a
@@ -727,8 +727,8 @@ def_fn_4x4 identity, flipadst
 
         smull_smlsl     v4,  v5,  \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5
         smull_smlal     v6,  v7,  \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6
-        rshrn_sz        v4,  v4,  v5,  #12, \sz // t5
-        rshrn_sz        v5,  v6,  v7,  #12, \sz // t6
+        sqrshrn_sz      v4,  v4,  v5,  #12, \sz // t5
+        sqrshrn_sz      v5,  v6,  v7,  #12, \sz // t6
 
         sqsub           \r7\sz,  \r0\sz,  v3\sz // out7
         sqadd           \r0\sz,  \r0\sz,  v3\sz // out0
@@ -762,19 +762,19 @@ endfunc
         smull_smlal     v2,  v3,  v23, v16, v0.h[0], v0.h[1], \sz
         smull_smlsl     v4,  v5,  v23, v16, v0.h[1], v0.h[0], \sz
         smull_smlal     v6,  v7,  v21, v18, v0.h[2], v0.h[3], \sz
-        rshrn_sz        v16, v2,  v3,  #12, \sz  // t0a
-        rshrn_sz        v23, v4,  v5,  #12, \sz  // t1a
+        sqrshrn_sz      v16, v2,  v3,  #12, \sz  // t0a
+        sqrshrn_sz      v23, v4,  v5,  #12, \sz  // t1a
         smull_smlsl     v2,  v3,  v21, v18, v0.h[3], v0.h[2], \sz
         smull_smlal     v4,  v5,  v19, v20, v0.h[4], v0.h[5], \sz
-        rshrn_sz        v18, v6,  v7,  #12, \sz  // t2a
-        rshrn_sz        v21, v2,  v3,  #12, \sz  // t3a
+        sqrshrn_sz      v18, v6,  v7,  #12, \sz  // t2a
+        sqrshrn_sz      v21, v2,  v3,  #12, \sz  // t3a
         smull_smlsl     v6,  v7,  v19, v20, v0.h[5], v0.h[4], \sz
         smull_smlal     v2,  v3,  v17, v22, v0.h[6], v0.h[7], \sz
-        rshrn_sz        v20, v4,  v5,  #12, \sz  // t4a
-        rshrn_sz        v19, v6,  v7,  #12, \sz  // t5a
+        sqrshrn_sz      v20, v4,  v5,  #12, \sz  // t4a
+        sqrshrn_sz      v19, v6,  v7,  #12, \sz  // t5a
         smull_smlsl     v4,  v5,  v17, v22, v0.h[7], v0.h[6], \sz
-        rshrn_sz        v22, v2,  v3,  #12, \sz  // t6a
-        rshrn_sz        v17, v4,  v5,  #12, \sz  // t7a
+        sqrshrn_sz      v22, v2,  v3,  #12, \sz  // t6a
+        sqrshrn_sz      v17, v4,  v5,  #12, \sz  // t7a
 
         sqadd           v2\sz,   v16\sz,  v20\sz // t0
         sqsub           v3\sz,   v16\sz,  v20\sz // t4
@@ -789,13 +789,13 @@ endfunc
         smull_smlsl     v20, v21, v3,  v5,  v1.h[2], v1.h[3], \sz
         smull_smlsl     v22, v23, v19, v7,  v1.h[3], v1.h[2], \sz
 
-        rshrn_sz        v3,  v16, v17, #12, \sz  // t4a
-        rshrn_sz        v5,  v20, v21, #12, \sz  // t5a
+        sqrshrn_sz      v3,  v16, v17, #12, \sz  // t4a
+        sqrshrn_sz      v5,  v20, v21, #12, \sz  // t5a
 
         smull_smlal     v16, v17, v19, v7,  v1.h[2], v1.h[3], \sz
 
-        rshrn_sz        v7,  v22, v23, #12, \sz  // t6a
-        rshrn_sz        v19, v16, v17, #12, \sz  // t7a
+        sqrshrn_sz      v7,  v22, v23, #12, \sz  // t6a
+        sqrshrn_sz      v19, v16, v17, #12, \sz  // t7a
 
         sqadd           \o0\()\sz, v2\sz, v6\sz  // out0
         sqsub           v2\sz,     v2\sz, v6\sz  // t2
@@ -812,11 +812,11 @@ endfunc
         smull_smlal     v18, v19, v2,  v4,  v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20)
         smull_smlsl     v6,  v7,  v2,  v4,  v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19)
         smull_smlsl     v20, v21, v3,  v5,  v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18)
-        rshrn_sz        v2,  v18, v19, #12, \sz // out3
+        sqrshrn_sz      v2,  v18, v19, #12, \sz // out3
         smull_smlal     v18, v19, v3,  v5,  v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21)
-        rshrn_sz        v3,  v20, v21, #12, \sz // out5
-        rshrn_sz        \o2, v18, v19, #12, \sz // out2 (v18 or v21)
-        rshrn_sz        \o4, v6,  v7,  #12, \sz // out4 (v20 or v19)
+        sqrshrn_sz      v3,  v20, v21, #12, \sz // out5
+        sqrshrn_sz      \o2, v18, v19, #12, \sz // out2 (v18 or v21)
+        sqrshrn_sz      \o4, v6,  v7,  #12, \sz // out4 (v20 or v19)
 
         sqneg           \o3\()\sz, v2\sz     // out3
         sqneg           \o5\()\sz, v3\sz     // out5
@@ -1033,19 +1033,19 @@ def_fns_48 8, 4
         smull_smlsl     v2,  v3,  v17, v31, v1.h[0], v1.h[1], \sz // -> t8a
         smull_smlal     v4,  v5,  v17, v31, v1.h[1], v1.h[0], \sz // -> t15a
         smull_smlsl     v6,  v7,  v25, v23, v1.h[2], v1.h[3], \sz // -> t9a
-        rshrn_sz        v17, v2,  v3,  #12, \sz                   // t8a
-        rshrn_sz        v31, v4,  v5,  #12, \sz                   // t15a
+        sqrshrn_sz      v17, v2,  v3,  #12, \sz                   // t8a
+        sqrshrn_sz      v31, v4,  v5,  #12, \sz                   // t15a
         smull_smlal     v2,  v3,  v25, v23, v1.h[3], v1.h[2], \sz // -> t14a
         smull_smlsl     v4,  v5,  v21, v27, v1.h[4], v1.h[5], \sz // -> t10a
-        rshrn_sz        v23, v6,  v7,  #12, \sz                   // t9a
-        rshrn_sz        v25, v2,  v3,  #12, \sz                   // t14a
+        sqrshrn_sz      v23, v6,  v7,  #12, \sz                   // t9a
+        sqrshrn_sz      v25, v2,  v3,  #12, \sz                   // t14a
         smull_smlal     v6,  v7,  v21, v27, v1.h[5], v1.h[4], \sz // -> t13a
         smull_smlsl     v2,  v3,  v29, v19, v1.h[6], v1.h[7], \sz // -> t11a
-        rshrn_sz        v21, v4,  v5,  #12, \sz                   // t10a
-        rshrn_sz        v27, v6,  v7,  #12, \sz                   // t13a
+        sqrshrn_sz      v21, v4,  v5,  #12, \sz                   // t10a
+        sqrshrn_sz      v27, v6,  v7,  #12, \sz                   // t13a
         smull_smlal     v4,  v5,  v29, v19, v1.h[7], v1.h[6], \sz // -> t12a
-        rshrn_sz        v19, v2,  v3,  #12, \sz                   // t11a
-        rshrn_sz        v29, v4,  v5,  #12, \sz                   // t12a
+        sqrshrn_sz      v19, v2,  v3,  #12, \sz                   // t11a
+        sqrshrn_sz      v29, v4,  v5,  #12, \sz                   // t12a
 
         sqsub           v2\sz,   v17\sz,  v23\sz  // t9
         sqadd           v17\sz,  v17\sz,  v23\sz  // t8
@@ -1058,17 +1058,17 @@ def_fns_48 8, 4
 
         smull_smlsl     v4,  v5,  v3,  v2,  v0.h[2], v0.h[3], \sz // -> t9a
         smull_smlal     v6,  v7,  v3,  v2,  v0.h[3], v0.h[2], \sz // -> t14a
-        rshrn_sz        v21, v4,  v5,  #12, \sz                   // t9a
-        rshrn_sz        v27, v6,  v7,  #12, \sz                   // t14a
+        sqrshrn_sz      v21, v4,  v5,  #12, \sz                   // t9a
+        sqrshrn_sz      v27, v6,  v7,  #12, \sz                   // t14a
 
         smull_smlsl     v4,  v5,  v29, v23, v0.h[2], v0.h[3], \sz // -> t13a
         smull_smlal     v6,  v7,  v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
-        rshrn_sz        v29, v4,  v5,  #12, \sz                   // t13a
+        sqrshrn_sz      v29, v4,  v5,  #12, \sz                   // t13a
         neg             v6.4s,   v6.4s
 .ifc \sz, .8h
         neg             v7.4s,   v7.4s
 .endif
-        rshrn_sz        v23, v6,  v7,  #12, \sz                   // t10a
+        sqrshrn_sz      v23, v6,  v7,  #12, \sz                   // t10a
 
         sqsub           v2\sz,   v17\sz,  v19\sz  // t11a
         sqadd           v17\sz,  v17\sz,  v19\sz  // t8a
@@ -1083,11 +1083,11 @@ def_fns_48 8, 4
         smull_smlal     v6,  v7,  v3,  v2,  v0.h[0], v0.h[0], \sz // -> t12
         smull_smlsl     v2,  v3,  v25, v21, v0.h[0], v0.h[0], \sz // -> t10a
 
-        rshrn_sz        v4,  v4,  v5,  #12, \sz   // t11
-        rshrn_sz        v5,  v6,  v7,  #12, \sz   // t12
+        sqrshrn_sz      v4,  v4,  v5,  #12, \sz   // t11
+        sqrshrn_sz      v5,  v6,  v7,  #12, \sz   // t12
         smull_smlal     v6,  v7,  v25, v21, v0.h[0], v0.h[0], \sz // -> t13a
-        rshrn_sz        v2,  v2,  v3,  #12, \sz   // t10a
-        rshrn_sz        v3,  v6,  v7,  #12, \sz   // t13a
+        sqrshrn_sz      v2,  v2,  v3,  #12, \sz   // t10a
+        sqrshrn_sz      v3,  v6,  v7,  #12, \sz   // t13a
 
         sqadd           v6\sz,   v16\sz,  v31\sz  // out0
         sqsub           v31\sz,  v16\sz,  v31\sz  // out15
@@ -1132,35 +1132,35 @@ endfunc
         smull_smlal     v2,  v3,  v31, v16, v0.h[0], v0.h[1], \sz // -> t0
         smull_smlsl     v4,  v5,  v31, v16, v0.h[1], v0.h[0], \sz // -> t1
         smull_smlal     v6,  v7,  v29, v18, v0.h[2], v0.h[3], \sz // -> t2
-        rshrn_sz        v16, v2,  v3,  #12, \sz   // t0
-        rshrn_sz        v31, v4,  v5,  #12, \sz   // t1
+        sqrshrn_sz      v16, v2,  v3,  #12, \sz   // t0
+        sqrshrn_sz      v31, v4,  v5,  #12, \sz   // t1
         smull_smlsl     v2,  v3,  v29, v18, v0.h[3], v0.h[2], \sz // -> t3
         smull_smlal     v4,  v5,  v27, v20, v0.h[4], v0.h[5], \sz // -> t4
-        rshrn_sz        v18, v6,  v7,  #12, \sz   // t2
-        rshrn_sz        v29, v2,  v3,  #12, \sz   // t3
+        sqrshrn_sz      v18, v6,  v7,  #12, \sz   // t2
+        sqrshrn_sz      v29, v2,  v3,  #12, \sz   // t3
         smull_smlsl     v6,  v7,  v27, v20, v0.h[5], v0.h[4], \sz // -> t5
         smull_smlal     v2,  v3,  v25, v22, v0.h[6], v0.h[7], \sz // -> t6
-        rshrn_sz        v20, v4,  v5,  #12, \sz   // t4
-        rshrn_sz        v27, v6,  v7,  #12, \sz   // t5
+        sqrshrn_sz      v20, v4,  v5,  #12, \sz   // t4
+        sqrshrn_sz      v27, v6,  v7,  #12, \sz   // t5
         smull_smlsl     v4,  v5,  v25, v22, v0.h[7], v0.h[6], \sz // -> t7
         smull_smlal     v6,  v7,  v23, v24, v1.h[0], v1.h[1], \sz // -> t8
-        rshrn_sz        v22, v2,  v3,  #12, \sz   // t6
-        rshrn_sz        v25, v4,  v5,  #12, \sz   // t7
+        sqrshrn_sz      v22, v2,  v3,  #12, \sz   // t6
+        sqrshrn_sz      v25, v4,  v5,  #12, \sz   // t7
         smull_smlsl     v2,  v3,  v23, v24, v1.h[1], v1.h[0], \sz // -> t9
         smull_smlal     v4,  v5,  v21, v26, v1.h[2], v1.h[3], \sz // -> t10
-        rshrn_sz        v23, v6,  v7,  #12, \sz   // t8
-        rshrn_sz        v24, v2,  v3,  #12, \sz   // t9
+        sqrshrn_sz      v23, v6,  v7,  #12, \sz   // t8
+        sqrshrn_sz      v24, v2,  v3,  #12, \sz   // t9
         smull_smlsl     v6,  v7,  v21, v26, v1.h[3], v1.h[2], \sz // -> t11
         smull_smlal     v2,  v3,  v19, v28, v1.h[4], v1.h[5], \sz // -> t12
-        rshrn_sz        v21, v4,  v5,  #12, \sz   // t10
-        rshrn_sz        v26, v6,  v7,  #12, \sz   // t11
+        sqrshrn_sz      v21, v4,  v5,  #12, \sz   // t10
+        sqrshrn_sz      v26, v6,  v7,  #12, \sz   // t11
         smull_smlsl     v4,  v5,  v19, v28, v1.h[5], v1.h[4], \sz // -> t13
         smull_smlal     v6,  v7,  v17, v30, v1.h[6], v1.h[7], \sz // -> t14
-        rshrn_sz        v19, v2,  v3,  #12, \sz   // t12
-        rshrn_sz        v28, v4,  v5,  #12, \sz   // t13
+        sqrshrn_sz      v19, v2,  v3,  #12, \sz   // t12
+        sqrshrn_sz      v28, v4,  v5,  #12, \sz   // t13
         smull_smlsl     v2,  v3,  v17, v30, v1.h[7], v1.h[6], \sz // -> t15
-        rshrn_sz        v17, v6,  v7,  #12, \sz   // t14
-        rshrn_sz        v30, v2,  v3,  #12, \sz   // t15
+        sqrshrn_sz      v17, v6,  v7,  #12, \sz   // t14
+        sqrshrn_sz      v30, v2,  v3,  #12, \sz   // t15
 
         ld1             {v0.8h}, [x16]
 
@@ -1184,19 +1184,19 @@ endfunc
         smull_smlal     v4,  v5,  v2,  v3,  v0.h[5], v0.h[4], \sz // -> t8
         smull_smlsl     v6,  v7,  v2,  v3,  v0.h[4], v0.h[5], \sz // -> t9
         smull_smlal     v2,  v3,  v18, v29, v0.h[7], v0.h[6], \sz // -> t10
-        rshrn_sz        v17, v4,  v5,  #12, \sz  // t8
-        rshrn_sz        v30, v6,  v7,  #12, \sz  // t9
+        sqrshrn_sz      v17, v4,  v5,  #12, \sz  // t8
+        sqrshrn_sz      v30, v6,  v7,  #12, \sz  // t9
         smull_smlsl     v4,  v5,  v18, v29, v0.h[6], v0.h[7], \sz // -> t11
         smull_smlsl     v6,  v7,  v27, v20, v0.h[5], v0.h[4], \sz // -> t12
-        rshrn_sz        v18, v2,  v3,  #12, \sz  // t10
-        rshrn_sz        v29, v4,  v5,  #12, \sz  // t11
+        sqrshrn_sz      v18, v2,  v3,  #12, \sz  // t10
+        sqrshrn_sz      v29, v4,  v5,  #12, \sz  // t11
         smull_smlal     v2,  v3,  v27, v20, v0.h[4], v0.h[5], \sz // -> t13
         smull_smlsl     v4,  v5,  v25, v22, v0.h[7], v0.h[6], \sz // -> t14
-        rshrn_sz        v27, v6,  v7,  #12, \sz  // t12
-        rshrn_sz        v20, v2,  v3,  #12, \sz  // t13
+        sqrshrn_sz      v27, v6,  v7,  #12, \sz  // t12
+        sqrshrn_sz      v20, v2,  v3,  #12, \sz  // t13
         smull_smlal     v6,  v7,  v25, v22, v0.h[6], v0.h[7], \sz // -> t15
-        rshrn_sz        v25, v4,  v5,  #12, \sz  // t14
-        rshrn_sz        v22, v6,  v7,  #12, \sz  // t15
+        sqrshrn_sz      v25, v4,  v5,  #12, \sz  // t14
+        sqrshrn_sz      v22, v6,  v7,  #12, \sz  // t15
 
         sqsub           v2\sz,   v16\sz,  v21\sz // t4
         sqadd           v16\sz,  v16\sz,  v21\sz // t0
@@ -1218,19 +1218,19 @@ endfunc
         smull_smlal     v4,  v5,  v2,  v3,  v0.h[3], v0.h[2], \sz // -> t4a
         smull_smlsl     v6,  v7,  v2,  v3,  v0.h[2], v0.h[3], \sz // -> t5a
         smull_smlsl     v2,  v3,  v24, v23, v0.h[3], v0.h[2], \sz // -> t6a
-        rshrn_sz        v22, v4,  v5,  #12, \sz // t4a
-        rshrn_sz        v25, v6,  v7,  #12, \sz // t5a
+        sqrshrn_sz      v22, v4,  v5,  #12, \sz // t4a
+        sqrshrn_sz      v25, v6,  v7,  #12, \sz // t5a
         smull_smlal     v4,  v5,  v24, v23, v0.h[2], v0.h[3], \sz // -> t7a
         smull_smlal     v6,  v7,  v17, v30, v0.h[3], v0.h[2], \sz // -> t12
-        rshrn_sz        v24, v2,  v3,  #12, \sz // t6a
-        rshrn_sz        v23, v4,  v5,  #12, \sz // t7a
+        sqrshrn_sz      v24, v2,  v3,  #12, \sz // t6a
+        sqrshrn_sz      v23, v4,  v5,  #12, \sz // t7a
         smull_smlsl     v2,  v3,  v17, v30, v0.h[2], v0.h[3], \sz // -> t13
         smull_smlsl     v4,  v5,  v29, v18, v0.h[3], v0.h[2], \sz // -> t14
-        rshrn_sz        v17, v6,  v7,  #12, \sz // t12
+        sqrshrn_sz      v17, v6,  v7,  #12, \sz // t12
         smull_smlal     v6,  v7,  v29, v18, v0.h[2], v0.h[3], \sz // -> t15
-        rshrn_sz        v29, v2,  v3,  #12, \sz // t13
-        rshrn_sz        v30, v4,  v5,  #12, \sz // t14
-        rshrn_sz        v18, v6,  v7,  #12, \sz // t15
+        sqrshrn_sz      v29, v2,  v3,  #12, \sz // t13
+        sqrshrn_sz      v30, v4,  v5,  #12, \sz // t14
+        sqrshrn_sz      v18, v6,  v7,  #12, \sz // t15
 
         sqsub           v2\sz,   v16\sz,  v21\sz // t2a
 .ifc \o0, v16
@@ -1267,21 +1267,21 @@ endfunc
         smull_smlal     v4,  v5,  v2,  v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24)
         smull_smlal     v6,  v7,  v26, v3,  v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26)
 
-        rshrn_sz        v24, v24, v25, #12, \sz // out8
-        rshrn_sz        v4,  v4,  v5,  #12, \sz // out7
-        rshrn_sz        v5,  v6,  v7,  #12, \sz // out5
+        sqrshrn_sz      v24, v24, v25, #12, \sz // out8
+        sqrshrn_sz      v4,  v4,  v5,  #12, \sz // out7
+        sqrshrn_sz      v5,  v6,  v7,  #12, \sz // out5
         smull_smlsl     v6,  v7,  v26, v3,  v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21)
         smull_smlal     v2,  v3,  v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27)
-        rshrn_sz        v26, v6,  v7,  #12, \sz // out10
+        sqrshrn_sz      v26, v6,  v7,  #12, \sz // out10
 
         smull_smlsl     v6,  v7,  v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20)
         smull_smlal     v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25)
         smull_smlsl     v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22)
 
-        rshrn_sz        \o4, v2,  v3,  #12, \sz // out4
-        rshrn_sz        v6,  v6,  v7,  #12, \sz // out11
-        rshrn_sz        v7,  v21, v25, #12, \sz // out9
-        rshrn_sz        \o6, v22, v23, #12, \sz // out6
+        sqrshrn_sz      \o4, v2,  v3,  #12, \sz // out4
+        sqrshrn_sz      v6,  v6,  v7,  #12, \sz // out11
+        sqrshrn_sz      v7,  v21, v25, #12, \sz // out9
+        sqrshrn_sz      \o6, v22, v23, #12, \sz // out6
 
 .ifc \o8, v23
         mov             \o8\szb,  v24\szb
@@ -1860,35 +1860,35 @@ function inv_dct32_odd_8h_x16_neon, export=1
         smull_smlsl     v2,  v3,  v16, v31, v0.h[0], v0.h[1], .8h // -> t16a
         smull_smlal     v4,  v5,  v16, v31, v0.h[1], v0.h[0], .8h // -> t31a
         smull_smlsl     v6,  v7,  v24, v23, v0.h[2], v0.h[3], .8h // -> t17a
-        rshrn_sz        v16, v2,  v3,  #12, .8h                   // t16a
-        rshrn_sz        v31, v4,  v5,  #12, .8h                   // t31a
+        sqrshrn_sz      v16, v2,  v3,  #12, .8h                   // t16a
+        sqrshrn_sz      v31, v4,  v5,  #12, .8h                   // t31a
         smull_smlal     v2,  v3,  v24, v23, v0.h[3], v0.h[2], .8h // -> t30a
         smull_smlsl     v4,  v5,  v20, v27, v0.h[4], v0.h[5], .8h // -> t18a
-        rshrn_sz        v24, v6,  v7,  #12, .8h                   // t17a
-        rshrn_sz        v23, v2,  v3,  #12, .8h                   // t30a
+        sqrshrn_sz      v24, v6,  v7,  #12, .8h                   // t17a
+        sqrshrn_sz      v23, v2,  v3,  #12, .8h                   // t30a
         smull_smlal     v6,  v7,  v20, v27, v0.h[5], v0.h[4], .8h // -> t29a
         smull_smlsl     v2,  v3,  v28, v19, v0.h[6], v0.h[7], .8h // -> t19a
-        rshrn_sz        v20, v4,  v5,  #12, .8h                   // t18a
-        rshrn_sz        v27, v6,  v7,  #12, .8h                   // t29a
+        sqrshrn_sz      v20, v4,  v5,  #12, .8h                   // t18a
+        sqrshrn_sz      v27, v6,  v7,  #12, .8h                   // t29a
         smull_smlal     v4,  v5,  v28, v19, v0.h[7], v0.h[6], .8h // -> t28a
         smull_smlsl     v6,  v7,  v18, v29, v1.h[0], v1.h[1], .8h // -> t20a
-        rshrn_sz        v28, v2,  v3,  #12, .8h                   // t19a
-        rshrn_sz        v19, v4,  v5,  #12, .8h                   // t28a
+        sqrshrn_sz      v28, v2,  v3,  #12, .8h                   // t19a
+        sqrshrn_sz      v19, v4,  v5,  #12, .8h                   // t28a
         smull_smlal     v2,  v3,  v18, v29, v1.h[1], v1.h[0], .8h // -> t27a
         smull_smlsl     v4,  v5,  v26, v21, v1.h[2], v1.h[3], .8h // -> t21a
-        rshrn_sz        v18, v6,  v7,  #12, .8h                   // t20a
-        rshrn_sz        v29, v2,  v3,  #12, .8h                   // t27a
+        sqrshrn_sz      v18, v6,  v7,  #12, .8h                   // t20a
+        sqrshrn_sz      v29, v2,  v3,  #12, .8h                   // t27a
         smull_smlal     v6,  v7,  v26, v21, v1.h[3], v1.h[2], .8h // -> t26a
         smull_smlsl     v2,  v3,  v22, v25, v1.h[4], v1.h[5], .8h // -> t22a
-        rshrn_sz        v26, v4,  v5,  #12, .8h                   // t21a
-        rshrn_sz        v21, v6,  v7,  #12, .8h                   // t26a
+        sqrshrn_sz      v26, v4,  v5,  #12, .8h                   // t21a
+        sqrshrn_sz      v21, v6,  v7,  #12, .8h                   // t26a
         smull_smlal     v4,  v5,  v22, v25, v1.h[5], v1.h[4], .8h // -> t25a
         smull_smlsl     v6,  v7,  v30, v17, v1.h[6], v1.h[7], .8h // -> t23a
-        rshrn_sz        v22, v2,  v3,  #12, .8h                   // t22a
-        rshrn_sz        v25, v4,  v5,  #12, .8h                   // t25a
+        sqrshrn_sz      v22, v2,  v3,  #12, .8h                   // t22a
+        sqrshrn_sz      v25, v4,  v5,  #12, .8h                   // t25a
         smull_smlal     v2,  v3,  v30, v17, v1.h[7], v1.h[6], .8h // -> t24a
-        rshrn_sz        v30, v6,  v7,  #12, .8h                   // t23a
-        rshrn_sz        v17, v2,  v3,  #12, .8h                   // t24a
+        sqrshrn_sz      v30, v6,  v7,  #12, .8h                   // t23a
+        sqrshrn_sz      v17, v2,  v3,  #12, .8h                   // t24a
 
         ld1             {v0.8h}, [x16]
 
@@ -1912,23 +1912,23 @@ function inv_dct32_odd_8h_x16_neon, export=1
         smull_smlsl     v4,  v5,  v3,  v2,  v0.h[4], v0.h[5], .8h // -> t17a
         smull_smlal     v6,  v7,  v3,  v2,  v0.h[5], v0.h[4], .8h // -> t30a
         smull_smlal     v2,  v3,  v19, v24, v0.h[5], v0.h[4], .8h // -> t18a
-        rshrn_sz        v21, v4,  v5,  #12, .8h                   // t17a
-        rshrn_sz        v27, v6,  v7,  #12, .8h                   // t30a
+        sqrshrn_sz      v21, v4,  v5,  #12, .8h                   // t17a
+        sqrshrn_sz      v27, v6,  v7,  #12, .8h                   // t30a
         neg             v2.4s,   v2.4s                            // -> t18a
         neg             v3.4s,   v3.4s                            // -> t18a
         smull_smlsl     v4,  v5,  v19, v24, v0.h[4], v0.h[5], .8h // -> t29a
         smull_smlsl     v6,  v7,  v22, v18, v0.h[6], v0.h[7], .8h // -> t21a
-        rshrn_sz        v19, v2,  v3,  #12, .8h                   // t18a
-        rshrn_sz        v24, v4,  v5,  #12, .8h                   // t29a
+        sqrshrn_sz      v19, v2,  v3,  #12, .8h                   // t18a
+        sqrshrn_sz      v24, v4,  v5,  #12, .8h                   // t29a
         smull_smlal     v2,  v3,  v22, v18, v0.h[7], v0.h[6], .8h // -> t26a
         smull_smlal     v4,  v5,  v17, v20, v0.h[7], v0.h[6], .8h // -> t22a
-        rshrn_sz        v22, v6,  v7,  #12, .8h                   // t21a
-        rshrn_sz        v18, v2,  v3,  #12, .8h                   // t26a
+        sqrshrn_sz      v22, v6,  v7,  #12, .8h                   // t21a
+        sqrshrn_sz      v18, v2,  v3,  #12, .8h                   // t26a
         neg             v4.4s,   v4.4s                            // -> t22a
         neg             v5.4s,   v5.4s                            // -> t22a
         smull_smlsl     v6,  v7,  v17, v20, v0.h[6], v0.h[7], .8h // -> t25a
-        rshrn_sz        v17, v4,  v5,  #12, .8h                   // t22a
-        rshrn_sz        v20, v6,  v7,  #12, .8h                   // t25a
+        sqrshrn_sz      v17, v4,  v5,  #12, .8h                   // t22a
+        sqrshrn_sz      v20, v6,  v7,  #12, .8h                   // t25a
 
         sqsub           v2.8h,   v27.8h,  v24.8h // t29
         sqadd           v27.8h,  v27.8h,  v24.8h // t30
@@ -1950,23 +1950,23 @@ function inv_dct32_odd_8h_x16_neon, export=1
         smull_smlsl     v4,  v5,  v2,  v3,  v0.h[2], v0.h[3], .8h // -> t18a
         smull_smlal     v6,  v7,  v2,  v3,  v0.h[3], v0.h[2], .8h // -> t29a
         smull_smlsl     v2,  v3,  v29, v24, v0.h[2], v0.h[3], .8h // -> t19
-        rshrn_sz        v18, v4,  v5,  #12, .8h                   // t18a
-        rshrn_sz        v25, v6,  v7,  #12, .8h                   // t29a
+        sqrshrn_sz      v18, v4,  v5,  #12, .8h                   // t18a
+        sqrshrn_sz      v25, v6,  v7,  #12, .8h                   // t29a
         smull_smlal     v4,  v5,  v29, v24, v0.h[3], v0.h[2], .8h // -> t28
         smull_smlal     v6,  v7,  v26, v19, v0.h[3], v0.h[2], .8h // -> t20
-        rshrn_sz        v29, v2,  v3,  #12, .8h                   // t19
-        rshrn_sz        v24, v4,  v5,  #12, .8h                   // t28
+        sqrshrn_sz      v29, v2,  v3,  #12, .8h                   // t19
+        sqrshrn_sz      v24, v4,  v5,  #12, .8h                   // t28
         neg             v6.4s,   v6.4s                            // -> t20
         neg             v7.4s,   v7.4s                            // -> t20
         smull_smlsl     v2,  v3,  v26, v19, v0.h[2], v0.h[3], .8h // -> t27
         smull_smlal     v4,  v5,  v20, v28, v0.h[3], v0.h[2], .8h // -> t21a
-        rshrn_sz        v26, v6,  v7,  #12, .8h                   // t20
-        rshrn_sz        v19, v2,  v3,  #12, .8h                   // t27
+        sqrshrn_sz      v26, v6,  v7,  #12, .8h                   // t20
+        sqrshrn_sz      v19, v2,  v3,  #12, .8h                   // t27
         neg             v4.4s,   v4.4s                            // -> t21a
         neg             v5.4s,   v5.4s                            // -> t21a
         smull_smlsl     v6,  v7,  v20, v28, v0.h[2], v0.h[3], .8h // -> t26a
-        rshrn_sz        v20, v4,  v5,  #12, .8h                   // t21a
-        rshrn_sz        v28, v6,  v7,  #12, .8h                   // t26a
+        sqrshrn_sz      v20, v4,  v5,  #12, .8h                   // t21a
+        sqrshrn_sz      v28, v6,  v7,  #12, .8h                   // t26a
 
         sqsub           v2.8h,   v16.8h,  v30.8h // t23
         sqadd           v16.8h,  v16.8h,  v30.8h // t16 = out16
@@ -1988,24 +1988,24 @@ function inv_dct32_odd_8h_x16_neon, export=1
 
         smull_smlsl     v4,  v5,  v24, v26, v0.h[0], v0.h[0], .8h // -> t20
         smull_smlal     v6,  v7,  v24, v26, v0.h[0], v0.h[0], .8h // -> t27
-        rshrn_sz        v20, v4,  v5,  #12, .8h   // t20
-        rshrn_sz        v22, v6,  v7,  #12, .8h   // t27
+        sqrshrn_sz      v20, v4,  v5,  #12, .8h   // t20
+        sqrshrn_sz      v22, v6,  v7,  #12, .8h   // t27
 
         smull_smlal     v4,  v5,  v25, v27, v0.h[0], v0.h[0], .8h // -> t26a
         smull_smlsl     v6,  v7,  v25, v27, v0.h[0], v0.h[0], .8h // -> t21a
         mov             v27.16b,  v22.16b         // t27
-        rshrn_sz        v26, v4,  v5,  #12, .8h   // t26a
+        sqrshrn_sz      v26, v4,  v5,  #12, .8h   // t26a
 
         smull_smlsl     v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22
         smull_smlal     v4,  v5,  v21, v23, v0.h[0], v0.h[0], .8h // -> t25
-        rshrn_sz        v21, v6,  v7,  #12, .8h   // t21a
-        rshrn_sz        v22, v24, v25, #12, .8h   // t22
-        rshrn_sz        v25, v4,  v5,  #12, .8h   // t25
+        sqrshrn_sz      v21, v6,  v7,  #12, .8h   // t21a
+        sqrshrn_sz      v22, v24, v25, #12, .8h   // t22
+        sqrshrn_sz      v25, v4,  v5,  #12, .8h   // t25
 
         smull_smlsl     v4,  v5,  v3,  v2,  v0.h[0], v0.h[0], .8h // -> t23a
         smull_smlal     v6,  v7,  v3,  v2,  v0.h[0], v0.h[0], .8h // -> t24a
-        rshrn_sz        v23, v4,  v5,  #12, .8h   // t23a
-        rshrn_sz        v24, v6,  v7,  #12, .8h   // t24a
+        sqrshrn_sz      v23, v4,  v5,  #12, .8h   // t23a
+        sqrshrn_sz      v24, v6,  v7,  #12, .8h   // t24a
 
         ret
 endfunc
@@ -2594,11 +2594,11 @@ function inv_dct64_step1_neon
         neg             v2.4s,   v2.4s              // t34a
         neg             v3.4s,   v3.4s              // t34a
         smull_smlsl     v6,  v7,  v30, v25, v1.h[1], v1.h[0], .8h // -> t33a
-        rshrn_sz        v26, v2,  v3,  #12, .8h     // t34a
+        sqrshrn_sz      v26, v2,  v3,  #12, .8h     // t34a
         smull_smlal     v2,  v3,  v30, v25, v1.h[0], v1.h[1], .8h // -> t62a
-        rshrn_sz        v29, v4,  v5,  #12, .8h     // t61a
-        rshrn_sz        v25, v6,  v7,  #12, .8h     // t33a
-        rshrn_sz        v30, v2,  v3,  #12, .8h     // t62a
+        sqrshrn_sz      v29, v4,  v5,  #12, .8h     // t61a
+        sqrshrn_sz      v25, v6,  v7,  #12, .8h     // t33a
+        sqrshrn_sz      v30, v2,  v3,  #12, .8h     // t62a
 
         sqadd           v16.8h,  v24.8h,  v27.8h    // t32a
         sqsub           v19.8h,  v24.8h,  v27.8h    // t35a
@@ -2612,11 +2612,11 @@ function inv_dct64_step1_neon
         smull_smlal     v2,  v3,  v21, v18, v1.h[2], v1.h[3], .8h // -> t61a
         smull_smlsl     v4,  v5,  v21, v18, v1.h[3], v1.h[2], .8h // -> t34a
         smull_smlal     v6,  v7,  v20, v19, v1.h[2], v1.h[3], .8h // -> t60
-        rshrn_sz        v21, v2,  v3,  #12, .8h     // t61a
-        rshrn_sz        v18, v4,  v5,  #12, .8h     // t34a
+        sqrshrn_sz      v21, v2,  v3,  #12, .8h     // t61a
+        sqrshrn_sz      v18, v4,  v5,  #12, .8h     // t34a
         smull_smlsl     v2,  v3,  v20, v19, v1.h[3], v1.h[2], .8h // -> t35
-        rshrn_sz        v20, v6,  v7,  #12, .8h     // t60
-        rshrn_sz        v19, v2,  v3,  #12, .8h     // t35
+        sqrshrn_sz      v20, v6,  v7,  #12, .8h     // t60
+        sqrshrn_sz      v19, v2,  v3,  #12, .8h     // t35
 
         st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
         st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64
@@ -2653,13 +2653,13 @@ function inv_dct64_step2_neon
         smull_smlal     v2,  v3,  v27, v25, v0.h[3], v0.h[2], .8h // -> t56a
         smull_smlsl     v4,  v5,  v27, v25, v0.h[2], v0.h[3], .8h // -> t39a
         smull_smlal     v6,  v7,  v31, v28, v0.h[3], v0.h[2], .8h // -> t40a
-        rshrn_sz        v25, v2,  v3,  #12, .8h     // t56a
-        rshrn_sz        v27, v4,  v5,  #12, .8h     // t39a
+        sqrshrn_sz      v25, v2,  v3,  #12, .8h     // t56a
+        sqrshrn_sz      v27, v4,  v5,  #12, .8h     // t39a
         neg             v6.4s,   v6.4s              // t40a
         neg             v7.4s,   v7.4s              // t40a
         smull_smlsl     v2,  v3,  v31, v28, v0.h[2], v0.h[3], .8h // -> t55a
-        rshrn_sz        v31, v6,  v7,  #12, .8h     // t40a
-        rshrn_sz        v28, v2,  v3,  #12, .8h     // t55a
+        sqrshrn_sz      v31, v6,  v7,  #12, .8h     // t40a
+        sqrshrn_sz      v28, v2,  v3,  #12, .8h     // t55a
 
         sqadd           v16.8h,  v24.8h,  v29.8h    // t32a
         sqsub           v19.8h,  v24.8h,  v29.8h    // t47a
@@ -2673,11 +2673,11 @@ function inv_dct64_step2_neon
         smull_smlsl     v2,  v3,  v21, v18, v0.h[0], v0.h[0], .8h // -> t40a
         smull_smlal     v4,  v5,  v21, v18, v0.h[0], v0.h[0], .8h // -> t55a
         smull_smlsl     v6,  v7,  v20, v19, v0.h[0], v0.h[0], .8h // -> t47
-        rshrn_sz        v18, v2,  v3,  #12, .8h     // t40a
-        rshrn_sz        v21, v4,  v5,  #12, .8h     // t55a
+        sqrshrn_sz      v18, v2,  v3,  #12, .8h     // t40a
+        sqrshrn_sz      v21, v4,  v5,  #12, .8h     // t55a
         smull_smlal     v2,  v3,  v20, v19, v0.h[0], v0.h[0], .8h // -> t48
-        rshrn_sz        v19, v6,  v7,  #12, .8h     // t47
-        rshrn_sz        v20, v2,  v3,  #12, .8h     // t48
+        sqrshrn_sz      v19, v6,  v7,  #12, .8h     // t47
+        sqrshrn_sz      v20, v2,  v3,  #12, .8h     // t48
 
         str             q16, [x6, #2*8*0]  // t32a
         str             q17, [x9, #2*8*0]  // t39
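
The itx.S hunks above are a mechanical swap of rshrn/rshrn2 for sqrshrn/sqrshrn2: both round and narrow the 32-bit butterfly results by 12 bits, but the sq variant saturates to the int16 range instead of wrapping when an intermediate overflows. A rough single-lane C model of the difference, for illustration only (not code from the patch):

    #include <stdint.h>

    /* rshrn #12: rounding shift right and narrow; overflow wraps on narrowing. */
    static int16_t rshrn12(int32_t v) {
        return (int16_t)(((int64_t)v + (1 << 11)) >> 12);
    }

    /* sqrshrn #12: same rounding shift, but the result saturates to int16. */
    static int16_t sqrshrn12(int32_t v) {
        int64_t r = ((int64_t)v + (1 << 11)) >> 12;
        if (r > INT16_MAX) r = INT16_MAX;
        if (r < INT16_MIN) r = INT16_MIN;
        return (int16_t)r;
    }
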
diff --git a/third_party/dav1d/src/arm/64/looprestoration.S b/third_party/dav1d/src/arm/64/looprestoration.S
index 778448a0f3f02..a598b72b03951 100644
--- a/third_party/dav1d/src/arm/64/looprestoration.S
+++ b/third_party/dav1d/src/arm/64/looprestoration.S
@@ -50,6 +50,7 @@ endconst
 //                                     const int16_t filter[2][8],
 //                                     const enum LrEdgeFlags edges);
 function wiener_filter7_8bpc_neon, export=1
+        AARCH64_SIGN_LINK_REGISTER
         stp             x29, x30, [sp, #-16]!
         mov             x29, sp
         ld1             {v0.8h, v1.8h},  [x6]
@@ -121,6 +122,7 @@ L(v1_7):
 
         mov             sp,  x29
         ldp             x29, x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
         ret
 
 L(no_top_7):
@@ -538,6 +540,7 @@ endfunc
 //                                     const int16_t filter[2][8],
 //                                     const enum LrEdgeFlags edges);
 function wiener_filter5_8bpc_neon, export=1
+        AARCH64_SIGN_LINK_REGISTER
         stp             x29, x30, [sp, #-16]!
         mov             x29, sp
         ld1             {v0.8h, v1.8h},  [x6]
@@ -598,6 +601,7 @@ L(end_5):
 
         mov             sp,  x29
         ldp             x29, x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
         ret
 
 L(no_top_5):
diff --git a/third_party/dav1d/src/arm/64/looprestoration16.S b/third_party/dav1d/src/arm/64/looprestoration16.S
index fcb4f84e7ef88..8954e604cf559 100644
--- a/third_party/dav1d/src/arm/64/looprestoration16.S
+++ b/third_party/dav1d/src/arm/64/looprestoration16.S
@@ -52,6 +52,7 @@ endconst
 //                                      const int bitdepth_max);
 function wiener_filter7_16bpc_neon, export=1
         ldr             w8,  [sp]
+        AARCH64_SIGN_LINK_REGISTER
         stp             x29, x30, [sp, #-32]!
         stp             d8,  d9,  [sp, #16]
         mov             x29, sp
@@ -137,6 +138,7 @@ L(v1_7):
         mov             sp,  x29
         ldp             d8,  d9,  [sp, #16]
         ldp             x29, x30, [sp], #32
+        AARCH64_VALIDATE_LINK_REGISTER
         ret
 
 L(no_top_7):
@@ -595,6 +597,7 @@ endfunc
 //                                      const int bitdepth_max);
 function wiener_filter5_16bpc_neon, export=1
         ldr             w8,  [sp]
+        AARCH64_SIGN_LINK_REGISTER
         stp             x29, x30, [sp, #-32]!
         stp             d8,  d9,  [sp, #16]
         mov             x29, sp
@@ -669,6 +672,7 @@ L(end_5):
         mov             sp,  x29
         ldp             d8,  d9,  [sp, #16]
         ldp             x29, x30, [sp], #32
+        AARCH64_VALIDATE_LINK_REGISTER
         ret
 
 L(no_top_5):
diff --git a/third_party/dav1d/src/arm/asm.S b/third_party/dav1d/src/arm/asm.S
index 017c89c9117d7..d1083c6b561b5 100644
--- a/third_party/dav1d/src/arm/asm.S
+++ b/third_party/dav1d/src/arm/asm.S
@@ -34,10 +34,78 @@
 #define x18 do_not_use_x18
 #define w18 do_not_use_w18
 
-/* Support macros for the Armv8.5-A Branch Target Identification feature which
- * requires emitting a .note.gnu.property section with the appropriate
- * architecture-dependent feature bits set.
- * Read more: "ELF for the Arm® 64-bit Architecture"
+/* Support macros for
+ *   - Armv8.3-A Pointer Authentication and
+ *   - Armv8.5-A Branch Target Identification
+ * features which require emitting a .note.gnu.property section with the
+ * appropriate architecture-dependent feature bits set.
+ *
+ * |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to
+ * PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be
+ * used immediately before saving the LR register (x30) to the stack.
+ * |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring
+ * it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone
+ * with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also
+ * have the same value at the two points. For example:
+ *
+ *   .global f
+ *   f:
+ *     AARCH64_SIGN_LINK_REGISTER
+ *     stp x29, x30, [sp, #-96]!
+ *     mov x29, sp
+ *     ...
+ *     ldp x29, x30, [sp], #96
+ *     AARCH64_VALIDATE_LINK_REGISTER
+ *     ret
+ *
+ * |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or
+ * |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an
+ * indirect call target. In particular, all symbols exported from a file must
+ * begin with one of these macros. For example, a leaf function that does not
+ * save LR can instead use |AARCH64_VALID_CALL_TARGET|:
+ *
+ *   .globl return_zero
+ *   return_zero:
+ *     AARCH64_VALID_CALL_TARGET
+ *     mov x0, #0
+ *     ret
+ *
+ * A non-leaf function which does not immediately save LR may need both macros
+ * because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function
+ * may jump to an alternate implementation before setting up the stack:
+ *
+ *   .globl with_early_jump
+ *   with_early_jump:
+ *     AARCH64_VALID_CALL_TARGET
+ *     cmp x0, #128
+ *     b.lt .Lwith_early_jump_128
+ *     AARCH64_SIGN_LINK_REGISTER
+ *     stp x29, x30, [sp, #-96]!
+ *     mov x29, sp
+ *     ...
+ *     ldp x29, x30, [sp], #96
+ *     AARCH64_VALIDATE_LINK_REGISTER
+ *     ret
+ *
+ *  .Lwith_early_jump_128:
+ *     ...
+ *     ret
+ *
+ * These annotations are only required with indirect calls. Private symbols that
+ * are only the target of direct calls do not require annotations. Also note
+ * that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not
+ * indirect jumps (BR). Indirect jumps in assembly are supported through
+ * |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and
+ * calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|.
+ *
+ * Although not necessary, it is safe to use these macros in 32-bit ARM
+ * assembly. This may be used to simplify dual 32-bit and 64-bit files.
+ *
+ * References:
+ * - "ELF for the Arm® 64-bit Architecture"
+ *   https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst
+ * - "Providing protection for complex software"
+ *   https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software
  */
 #if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1)
 #define GNU_PROPERTY_AARCH64_BTI (1 << 0)   // Has Branch Target Identification
@@ -51,7 +119,32 @@
 #define AARCH64_VALID_JUMP_TARGET
 #endif
 
-#if (GNU_PROPERTY_AARCH64_BTI != 0)
+#if defined(__ARM_FEATURE_PAC_DEFAULT)
+
+#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A
+#define AARCH64_SIGN_LINK_REGISTER      paciasp
+#define AARCH64_VALIDATE_LINK_REGISTER  autiasp
+#elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B
+#define AARCH64_SIGN_LINK_REGISTER      pacibsp
+#define AARCH64_VALIDATE_LINK_REGISTER  autibsp
+#else
+#error Pointer authentication defines no valid key!
+#endif
+#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) // authentication of leaf functions
+#error Authentication of leaf functions is enabled but not supported in dav1d!
+#endif
+#define GNU_PROPERTY_AARCH64_PAC (1 << 1)
+
+#else /* __ARM_FEATURE_PAC_DEFAULT */
+
+#define GNU_PROPERTY_AARCH64_PAC 0
+#define AARCH64_SIGN_LINK_REGISTER
+#define AARCH64_VALIDATE_LINK_REGISTER
+
+#endif /* !__ARM_FEATURE_PAC_DEFAULT */
+
+
+#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__)
         .pushsection .note.gnu.property, "a"
         .balign 8
         .long 4
@@ -60,11 +153,11 @@
         .asciz "GNU"
         .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
         .long 4
-        .long GNU_PROPERTY_AARCH64_BTI
+        .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC)
         .long 0
         .popsection
-#endif
-#endif
+#endif /* (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) */
+#endif /* ARCH_AARCH64 */
 
 #if ARCH_ARM
         .syntax unified
@@ -74,7 +167,7 @@
         .eabi_attribute 10, 0           // suppress Tag_FP_arch
         .eabi_attribute 12, 0           // suppress Tag_Advanced_SIMD_arch
         .section .note.GNU-stack,"",%progbits // Mark stack as non-executable
-#endif
+#endif /* __ELF__ */
 
 #ifdef _WIN32
 #define CONFIG_THUMB 1
@@ -89,8 +182,8 @@
 #else
 #define A
 #define T @
-#endif
-#endif
+#endif /* CONFIG_THUMB */
+#endif /* ARCH_ARM */
 
 #if !defined(PIC)
 #if defined(__PIC__)
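
Whether the new PAC/BTI paths in asm.S are active depends entirely on the compiler's predefined feature macros, normally driven by -mbranch-protection on GCC/Clang. A small stand-alone check of the bits the patch tests, for illustration only (bit 0: key A, bit 1: key B, bit 2: leaf functions):

    #include <stdio.h>

    int main(void) {
    #ifdef __ARM_FEATURE_BTI_DEFAULT
        printf("__ARM_FEATURE_BTI_DEFAULT = %d\n", __ARM_FEATURE_BTI_DEFAULT);
    #else
        printf("BTI not enabled by the compiler flags\n");
    #endif
    #ifdef __ARM_FEATURE_PAC_DEFAULT
        /* bit 0: sign with key A, bit 1: key B, bit 2: also sign leaf functions */
        printf("__ARM_FEATURE_PAC_DEFAULT = 0x%x\n", __ARM_FEATURE_PAC_DEFAULT);
    #else
        printf("PAC not enabled by the compiler flags\n");
    #endif
        return 0;
    }

Building with, e.g., -mbranch-protection=standard should report both macros set with key A, which matches the paciasp/autiasp pair the new #if chain selects.
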
diff --git a/third_party/dav1d/src/arm/film_grain_init_tmpl.c b/third_party/dav1d/src/arm/filmgrain_init_tmpl.c
similarity index 99%
rename from third_party/dav1d/src/arm/film_grain_init_tmpl.c
rename to third_party/dav1d/src/arm/filmgrain_init_tmpl.c
index 3a416020532f5..2156047d02ad6 100644
--- a/third_party/dav1d/src/arm/film_grain_init_tmpl.c
+++ b/third_party/dav1d/src/arm/filmgrain_init_tmpl.c
@@ -28,7 +28,7 @@
  */
 
 #include "src/cpu.h"
-#include "src/film_grain.h"
+#include "src/filmgrain.h"
 #include "asm-offsets.h"
 
 CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED);
diff --git a/third_party/dav1d/src/data.c b/third_party/dav1d/src/data.c
index 4c7bf82715c48..fa6165ec72179 100644
--- a/third_party/dav1d/src/data.c
+++ b/third_party/dav1d/src/data.c
@@ -116,11 +116,17 @@ void dav1d_data_props_copy(Dav1dDataProps *const dst,
 void dav1d_data_props_set_defaults(Dav1dDataProps *const props) {
     assert(props != NULL);
 
+    memset(props, 0, sizeof(*props));
     props->timestamp = INT64_MIN;
-    props->duration = 0;
     props->offset = -1;
-    props->user_data.data = NULL;
-    props->user_data.ref = NULL;
+}
+
+void dav1d_data_props_unref_internal(Dav1dDataProps *const props) {
+    validate_input(props != NULL);
+
+    struct Dav1dRef *user_data_ref = props->user_data.ref;
+    dav1d_data_props_set_defaults(props);
+    dav1d_ref_dec(&user_data_ref);
 }
 
 void dav1d_data_unref_internal(Dav1dData *const buf) {
@@ -132,5 +138,6 @@ void dav1d_data_unref_internal(Dav1dData *const buf) {
         dav1d_ref_dec(&buf->ref);
     }
     memset(buf, 0, sizeof(*buf));
+    dav1d_data_props_set_defaults(&buf->m);
     dav1d_ref_dec(&user_data_ref);
 }
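With the change above, dav1d_data_props_set_defaults() clears the whole struct before filling in the two non-zero defaults, and the new dav1d_data_props_unref_internal() snapshots the user_data ref before resetting so the reference is released last. A small sketch (check_props_defaults is a hypothetical helper, not part of the patch) of the post-conditions that now hold after either call:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    #include <dav1d/data.h>   /* public Dav1dDataProps layout */

    /* Holds after dav1d_data_props_set_defaults(p), and therefore also after
     * an unref, which resets through the same function. */
    static void check_props_defaults(const Dav1dDataProps *const p) {
        assert(p->timestamp == INT64_MIN);
        assert(p->offset == -1);
        assert(p->duration == 0 && p->size == 0);       /* zeroed by the memset */
        assert(p->user_data.data == NULL && p->user_data.ref == NULL);
    }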
diff --git a/third_party/dav1d/src/data.h b/third_party/dav1d/src/data.h
index 5b07021c532ef..b34c1db702ca3 100644
--- a/third_party/dav1d/src/data.h
+++ b/third_party/dav1d/src/data.h
@@ -51,5 +51,6 @@ int dav1d_data_wrap_user_data_internal(Dav1dData *buf,
                                                              void *cookie),
                                        void *cookie);
 void dav1d_data_unref_internal(Dav1dData *buf);
+void dav1d_data_props_unref_internal(Dav1dDataProps *props);
 
 #endif /* DAV1D_SRC_DATA_H */
diff --git a/third_party/dav1d/src/decode.c b/third_party/dav1d/src/decode.c
index cec91e9fb3009..bd13014947e3b 100644
--- a/third_party/dav1d/src/decode.c
+++ b/third_party/dav1d/src/decode.c
@@ -42,7 +42,7 @@
 #include "src/decode.h"
 #include "src/dequant_tables.h"
 #include "src/env.h"
-#include "src/film_grain.h"
+#include "src/filmgrain.h"
 #include "src/log.h"
 #include "src/qm.h"
 #include "src/recon.h"
@@ -1242,6 +1242,7 @@ static int decode_b(Dav1dTaskContext *const t,
             if (DEBUG_BLOCK_INFO)
                 printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
 
+            b->uv_angle = 0;
             if (b->uv_mode == CFL_PRED) {
 #define SIGN(a) (!!(a) + ((a) > 0))
                 const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac,
@@ -1274,8 +1275,6 @@ static int decode_b(Dav1dTaskContext *const t,
                 uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED];
                 const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
                 b->uv_angle = angle - 3;
-            } else {
-                b->uv_angle = 0;
             }
         }
 
@@ -3231,8 +3230,6 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
         if (ret < 0) goto error;
     }
 
-    retval = DAV1D_ERR(EINVAL);
-
     // setup dequant tables
     init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq);
     if (f->frame_hdr->quant.qm)
@@ -3356,7 +3353,7 @@ error:
 
 int dav1d_decode_frame_main(Dav1dFrameContext *const f) {
     const Dav1dContext *const c = f->c;
-    int retval = DAV1D_ERR(ENOMEM);
+    int retval = DAV1D_ERR(EINVAL);
 
     assert(f->c->n_tc == 1);
 
@@ -3500,7 +3497,13 @@ int dav1d_submit_frame(Dav1dContext *const c) {
             if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
                 c->task_thread.cur--;
         }
-        if (out_delayed->p.data[0]) {
+        const int error = f->task_thread.retval;
+        if (error) {
+            f->task_thread.retval = 0;
+            c->cached_error = error;
+            dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m);
+            dav1d_thread_picture_unref(out_delayed);
+        } else if (out_delayed->p.data[0]) {
             const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
                                                            memory_order_relaxed);
             if ((out_delayed->visible || c->output_invisible_frames) &&
@@ -3842,7 +3845,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
                     dav1d_ref_dec(&c->refs[i].refmvs);
                 }
             }
-            return res;
+            goto error;
         }
     } else {
         dav1d_task_frame_init(f);
@@ -3869,6 +3872,7 @@ error:
     dav1d_ref_dec(&f->mvs_ref);
     dav1d_ref_dec(&f->seq_hdr_ref);
     dav1d_ref_dec(&f->frame_hdr_ref);
+    dav1d_data_props_copy(&c->cached_error_props, &c->in.m);
 
     for (int i = 0; i < f->n_tile_data; i++)
         dav1d_data_unref_internal(&f->tile[i].data);
diff --git a/third_party/dav1d/src/ext/x86/x86inc.asm b/third_party/dav1d/src/ext/x86/x86inc.asm
index c9f2d6398cde1..68b1f74f4bb02 100644
--- a/third_party/dav1d/src/ext/x86/x86inc.asm
+++ b/third_party/dav1d/src/ext/x86/x86inc.asm
@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* x86inc.asm: x86 abstraction layer
 ;*****************************************************************************
-;* Copyright (C) 2005-2021 x264 project
+;* Copyright (C) 2005-2022 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@xxxxxxxxxxxxxxxx>
 ;*          Henrik Gramner <henrik@xxxxxxxxxxx>
@@ -238,6 +238,16 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 %endif
 %endmacro
 
+; Repeats an instruction/operation for multiple arguments.
+; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3"
+%macro REPX 2-* ; operation, args
+    %xdefine %%f(x) %1
+    %rep %0 - 1
+        %rotate 1
+        %%f(%1)
+    %endrep
+%endmacro
+
 %macro PUSH 1
     push %1
     %ifidn rstk, rsp
@@ -1342,7 +1352,20 @@ INIT_XMM
             %1 %6, __src2
         %endif
     %elif %0 >= 9
-        __instr %6, %7, %8, %9
+        %if avx_enabled && __sizeofreg >= 16 && %4 == 1
+            %ifnnum regnumof%7
+                %if %3
+                    vmovaps %6, %7
+                %else
+                    vmovdqa %6, %7
+                %endif
+                __instr %6, %6, %8, %9
+            %else
+                __instr %6, %7, %8, %9
+            %endif
+        %else
+            __instr %6, %7, %8, %9
+        %endif
     %elif %0 == 8
         %if avx_enabled && __sizeofreg >= 16 && %4 == 0
             %xdefine __src1 %7
@@ -1379,7 +1402,7 @@ INIT_XMM
                 %else
                     vmovdqa %6, %7
                 %endif
-                __instr %6, %8
+                __instr %6, %6, %8
             %else
                 __instr %6, __src1, __src2
             %endif
@@ -1448,8 +1471,8 @@ AVX_INSTR andpd, sse2, 1, 0, 1
 AVX_INSTR andps, sse, 1, 0, 1
 AVX_INSTR blendpd, sse4, 1, 1, 0
 AVX_INSTR blendps, sse4, 1, 1, 0
-AVX_INSTR blendvpd, sse4 ; can't be emulated
-AVX_INSTR blendvps, sse4 ; can't be emulated
+AVX_INSTR blendvpd, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
+AVX_INSTR blendvps, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
 AVX_INSTR cmpeqpd, sse2, 1, 0, 1
 AVX_INSTR cmpeqps, sse, 1, 0, 1
 AVX_INSTR cmpeqsd, sse2, 1, 0, 0
@@ -1582,7 +1605,7 @@ AVX_INSTR pand, mmx, 0, 0, 1
 AVX_INSTR pandn, mmx, 0, 0, 0
 AVX_INSTR pavgb, mmx2, 0, 0, 1
 AVX_INSTR pavgw, mmx2, 0, 0, 1
-AVX_INSTR pblendvb, sse4 ; can't be emulated
+AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding
 AVX_INSTR pblendw, sse4, 0, 1, 0
 AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
 AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
diff --git a/third_party/dav1d/src/fg_apply.h b/third_party/dav1d/src/fg_apply.h
index 779549bb2d020..be6685d8018bc 100644
--- a/third_party/dav1d/src/fg_apply.h
+++ b/third_party/dav1d/src/fg_apply.h
@@ -32,10 +32,27 @@
 
 #include "common/bitdepth.h"
 
-#include "src/film_grain.h"
+#include "src/filmgrain.h"
 
-bitfn_decls(void dav1d_apply_grain, const Dav1dFilmGrainDSPContext *const dsp,
-                                    Dav1dPicture *const out,
-                                    const Dav1dPicture *const in);
+#ifdef BITDEPTH
+# define array_decl(type, name, sz) type name sz
+#else
+# define array_decl(type, name, sz) void *name
+#endif
+
+bitfn_decls(void dav1d_apply_grain,
+            const Dav1dFilmGrainDSPContext *const dsp,
+            Dav1dPicture *const out, const Dav1dPicture *const in);
+bitfn_decls(void dav1d_prep_grain,
+            const Dav1dFilmGrainDSPContext *const dsp,
+            Dav1dPicture *const out, const Dav1dPicture *const in,
+            array_decl(uint8_t, scaling, [3][SCALING_SIZE]),
+            array_decl(entry, grain_lut, [3][GRAIN_HEIGHT+1][GRAIN_WIDTH]));
+bitfn_decls(void dav1d_apply_grain_row,
+            const Dav1dFilmGrainDSPContext *const dsp,
+            Dav1dPicture *const out, const Dav1dPicture *const in,
+            array_decl(const uint8_t, scaling, [3][SCALING_SIZE]),
+            array_decl(const entry, grain_lut, [3][GRAIN_HEIGHT+1][GRAIN_WIDTH]),
+            const int row);
 
 #endif /* DAV1D_SRC_FG_APPLY_H */
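The array_decl() helper lets this header be included both from the bitdepth-templated sources and from generic code: with BITDEPTH defined, the new prep/apply-row entry points take real array parameters, while without it the same declarations collapse to untyped pointers (presumably so code built without a fixed BITDEPTH can forward the scratch buffers as-is). A sketch of the two expansions for the 8 bpc symbol, assuming the usual _8bpc suffix produced by bitfn_decls:

    #ifdef BITDEPTH
    /* templated sources: entry is int8_t for 8 bpc (int16_t for high bitdepth),
     * SCALING_SIZE is 256 (4096) -- see filmgrain.h further down */
    void dav1d_prep_grain_8bpc(const Dav1dFilmGrainDSPContext *dsp,
                               Dav1dPicture *out, const Dav1dPicture *in,
                               uint8_t scaling[3][SCALING_SIZE],
                               entry grain_lut[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH]);
    #else
    /* generic code built without BITDEPTH: the same bitfn_decls() line
     * degrades the array parameters to untyped pointers */
    void dav1d_prep_grain_8bpc(const Dav1dFilmGrainDSPContext *dsp,
                               Dav1dPicture *out, const Dav1dPicture *in,
                               void *scaling, void *grain_lut);
    #endif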
diff --git a/third_party/dav1d/src/fg_apply_tmpl.c b/third_party/dav1d/src/fg_apply_tmpl.c
index c254a3dffa4f8..ee14db9a4cea6 100644
--- a/third_party/dav1d/src/fg_apply_tmpl.c
+++ b/third_party/dav1d/src/fg_apply_tmpl.c
@@ -30,13 +30,13 @@
 
 #include <stdint.h>
 
+#include "dav1d/common.h"
 #include "dav1d/picture.h"
 
-#include "common.h"
 #include "common/intops.h"
 #include "common/bitdepth.h"
 
-#include "fg_apply.h"
+#include "src/fg_apply.h"
 
 static void generate_scaling(const int bitdepth,
                              const uint8_t points[][2], const int num,
@@ -44,14 +44,15 @@ static void generate_scaling(const int bitdepth,
 {
 #if BITDEPTH == 8
     const int shift_x = 0;
+    const int scaling_size = SCALING_SIZE;
 #else
+    assert(bitdepth > 8);
     const int shift_x = bitdepth - 8;
-#endif
     const int scaling_size = 1 << bitdepth;
+#endif
 
     // Fill up the preceding entries with the initial value
-    for (int i = 0; i < points[0][0] << shift_x; i++)
-        scaling[i] = points[0][1];
+    memset(scaling, points[0][1], points[0][0] << shift_x);
 
     // Linearly interpolate the values in the middle
     for (int i = 0; i < num - 1; i++) {
@@ -61,16 +62,17 @@ static void generate_scaling(const int bitdepth,
         const int ey = points[i+1][1];
         const int dx = ex - bx;
         const int dy = ey - by;
+        assert(dx > 0);
         const int delta = dy * ((0x10000 + (dx >> 1)) / dx);
-        for (int x = 0; x < dx; x++) {
-            const int v = by + ((x * delta + 0x8000) >> 16);
-            scaling[(bx + x) << shift_x] = v;
+        for (int x = 0, d = 0x8000; x < dx; x++) {
+            scaling[(bx + x) << shift_x] = by + (d >> 16);
+            d += delta;
         }
     }
 
     // Fill up the remaining entries with the final value
-    for (int i = points[num - 1][0] << shift_x; i < scaling_size; i++)
-        scaling[i] = points[num - 1][1];
+    const int n = points[num - 1][0] << shift_x;
+    memset(&scaling[n], points[num - 1][1], scaling_size - n);
 
 #if BITDEPTH != 8
     const int pad = 1 << shift_x, rnd = pad >> 1;
@@ -80,8 +82,9 @@ static void generate_scaling(const int bitdepth,
         const int dx = ex - bx;
         for (int x = 0; x < dx; x += pad) {
             const int range = scaling[bx + x + pad] - scaling[bx + x];
-            for (int n = 1; n < pad; n++) {
-                scaling[bx + x + n] = scaling[bx + x] + ((range * n + rnd) >> shift_x);
+            for (int n = 1, r = rnd; n < pad; n++) {
+                r += range;
+                scaling[bx + x + n] = scaling[bx + x] + (r >> shift_x);
             }
         }
     }
@@ -89,14 +92,13 @@ static void generate_scaling(const int bitdepth,
 }
 
 #ifndef UNIT_TEST
-void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
-                              Dav1dPicture *const out,
-                              const Dav1dPicture *const in)
+void bitfn(dav1d_prep_grain)(const Dav1dFilmGrainDSPContext *const dsp,
+                             Dav1dPicture *const out,
+                             const Dav1dPicture *const in,
+                             uint8_t scaling[3][SCALING_SIZE],
+                             entry grain_lut[3][GRAIN_HEIGHT+1][GRAIN_WIDTH])
 {
     const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
-
-    ALIGN_STK_16(entry, grain_lut, 3,[GRAIN_HEIGHT + 1][GRAIN_WIDTH]);
-    uint8_t scaling[3][SCALING_SIZE];
 #if BITDEPTH != 8
     const int bitdepth_max = (1 << out->p.bpc) - 1;
 #endif
@@ -150,60 +152,86 @@ void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
                 memcpy(out->data[2], in->data[2], sz);
         }
     }
+}
 
+void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp,
+                                  Dav1dPicture *const out,
+                                  const Dav1dPicture *const in,
+                                  const uint8_t scaling[3][SCALING_SIZE],
+                                  const entry grain_lut[3][GRAIN_HEIGHT+1][GRAIN_WIDTH],
+                                  const int row)
+{
     // Synthesize grain for the affected planes
-    const int rows = (out->p.h + 31) >> 5;
+    const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
     const int ss_y = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
     const int ss_x = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
     const int cpw = (out->p.w + ss_x) >> ss_x;
     const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
-    for (int row = 0; row < rows; row++) {
-        pixel *const luma_src =
-            ((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]);
-
-        if (data->num_y_points) {
-            const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE);
-            dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]),
-                             luma_src, out->stride[0], data,
-                             out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
-        }
+    pixel *const luma_src =
+        ((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]);
+#if BITDEPTH != 8
+    const int bitdepth_max = (1 << out->p.bpc) - 1;
+#endif
 
-        if (!data->num_uv_points[0] && !data->num_uv_points[1] &&
-            !data->chroma_scaling_from_luma)
-        {
-            continue;
-        }
+    if (data->num_y_points) {
+        const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE);
+        dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]),
+                         luma_src, out->stride[0], data,
+                         out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
+    }
 
-        const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y;
+    if (!data->num_uv_points[0] && !data->num_uv_points[1] &&
+        !data->chroma_scaling_from_luma)
+    {
+        return;
+    }
 
-        // extend padding pixels
-        if (out->p.w & ss_x) {
-            pixel *ptr = luma_src;
-            for (int y = 0; y < bh; y++) {
-                ptr[out->p.w] = ptr[out->p.w - 1];
-                ptr += PXSTRIDE(in->stride[0]) << ss_y;
-            }
+    const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y;
+
+    // extend padding pixels
+    if (out->p.w & ss_x) {
+        pixel *ptr = luma_src;
+        for (int y = 0; y < bh; y++) {
+            ptr[out->p.w] = ptr[out->p.w - 1];
+            ptr += PXSTRIDE(in->stride[0]) << ss_y;
         }
+    }
 
-        const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
-        if (data->chroma_scaling_from_luma) {
-            for (int pl = 0; pl < 2; pl++)
+    const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
+    if (data->chroma_scaling_from_luma) {
+        for (int pl = 0; pl < 2; pl++)
+            dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
+                                                ((const pixel *) in->data[1 + pl]) + uv_off,
+                                                in->stride[1], data, cpw,
+                                                scaling[0], grain_lut[1 + pl],
+                                                bh, row, luma_src, in->stride[0],
+                                                pl, is_id HIGHBD_TAIL_SUFFIX);
+    } else {
+        for (int pl = 0; pl < 2; pl++)
+            if (data->num_uv_points[pl])
                 dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
                                                     ((const pixel *) in->data[1 + pl]) + uv_off,
                                                     in->stride[1], data, cpw,
-                                                    scaling[0], grain_lut[1 + pl],
+                                                    scaling[1 + pl], grain_lut[1 + pl],
                                                     bh, row, luma_src, in->stride[0],
                                                     pl, is_id HIGHBD_TAIL_SUFFIX);
-        } else {
-            for (int pl = 0; pl < 2; pl++)
-                if (data->num_uv_points[pl])
-                    dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
-                                                        ((const pixel *) in->data[1 + pl]) + uv_off,
-                                                        in->stride[1], data, cpw,
-                                                        scaling[1 + pl], grain_lut[1 + pl],
-                                                        bh, row, luma_src, in->stride[0],
-                                                        pl, is_id HIGHBD_TAIL_SUFFIX);
-        }
     }
 }
+
+void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
+                              Dav1dPicture *const out,
+                              const Dav1dPicture *const in)
+{
+    ALIGN_STK_16(entry, grain_lut, 3,[GRAIN_HEIGHT + 1][GRAIN_WIDTH]);
+#if ARCH_X86_64 && BITDEPTH == 8
+    ALIGN_STK_64(uint8_t, scaling, 3,[SCALING_SIZE]);
+#else
+    uint8_t scaling[3][SCALING_SIZE];
+#endif
+    const int rows = (out->p.h + 31) >> 5;
+
+    bitfn(dav1d_prep_grain)(dsp, out, in, scaling, grain_lut);
+    for (int row = 0; row < rows; row++)
+        bitfn(dav1d_apply_grain_row)(dsp, out, in, scaling, grain_lut, row);
+}
 #endif
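Two things change in generate_scaling() besides the explicit SCALING_SIZE bound: the flat head/tail fills become memset() calls, and the per-sample multiply in the interpolation loop is replaced by a running fixed-point accumulator. The following stand-alone sketch (interp_segment is a hypothetical test helper, not part of the patch) checks that the old and new inner loops produce identical values for one segment:

    #include <assert.h>

    /* old vs. new inner loop of generate_scaling() for one segment
     * (bx,by) -> (ex,ey); out_* must have room for ex - bx values */
    static void interp_segment(const int bx, const int by,
                               const int ex, const int ey,
                               int *const out_mul, int *const out_acc)
    {
        const int dx = ex - bx, dy = ey - by;
        const int delta = dy * ((0x10000 + (dx >> 1)) / dx);

        for (int x = 0; x < dx; x++)                          /* old: multiply per sample */
            out_mul[x] = by + ((x * delta + 0x8000) >> 16);

        for (int x = 0, d = 0x8000; x < dx; x++, d += delta)  /* new: accumulate */
            out_acc[x] = by + (d >> 16);

        for (int x = 0; x < dx; x++)
            assert(out_mul[x] == out_acc[x]);
    }

    int main(void) {
        int a[256], b[256];
        interp_segment(10, 40, 130, 200, a, b);   /* arbitrary example points */
        return 0;
    }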
diff --git a/third_party/dav1d/src/filmgrain.h b/third_party/dav1d/src/filmgrain.h
new file mode 100644
index 0000000000000..d953542a82a73
--- /dev/null
+++ b/third_party/dav1d/src/filmgrain.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_FILM_GRAIN_H
+#define DAV1D_SRC_FILM_GRAIN_H
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+#define BLOCK_SIZE 32
+#if !defined(BITDEPTH) || BITDEPTH == 8
+#define SCALING_SIZE 256
+typedef int8_t entry;
+#else
+#define SCALING_SIZE 4096
+typedef int16_t entry;
+#endif
+
+#define decl_generate_grain_y_fn(name) \
+void (name)(entry buf[][GRAIN_WIDTH], \
+            const Dav1dFilmGrainData *const data HIGHBD_DECL_SUFFIX)
+typedef decl_generate_grain_y_fn(*generate_grain_y_fn);
+
+#define decl_generate_grain_uv_fn(name) \
+void (name)(entry buf[][GRAIN_WIDTH], \
+            const entry buf_y[][GRAIN_WIDTH], \
+            const Dav1dFilmGrainData *const data, const intptr_t uv HIGHBD_DECL_SUFFIX)
+typedef decl_generate_grain_uv_fn(*generate_grain_uv_fn);
+
+#define decl_fgy_32x32xn_fn(name) \
+void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
+            const Dav1dFilmGrainData *data, \
+            size_t pw, const uint8_t scaling[SCALING_SIZE], \
+            const entry grain_lut[][GRAIN_WIDTH], \
+            int bh, int row_num HIGHBD_DECL_SUFFIX)
+typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn);
+
+#define decl_fguv_32x32xn_fn(name) \
+void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
+            const Dav1dFilmGrainData *data, int pw, \
+            const uint8_t scaling[SCALING_SIZE], \
+            const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \
+            const pixel *luma_row, ptrdiff_t luma_stride, \
+            int uv_pl, int is_id HIGHBD_DECL_SUFFIX)
+typedef decl_fguv_32x32xn_fn(*fguv_32x32xn_fn);
+
+typedef struct Dav1dFilmGrainDSPContext {
+    generate_grain_y_fn generate_grain_y;
+    generate_grain_uv_fn generate_grain_uv[3];
+
+    fgy_32x32xn_fn fgy_32x32xn;
+    fguv_32x32xn_fn fguv_32x32xn[3];
+} Dav1dFilmGrainDSPContext;
+
+bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c);
+bitfn_decls(void dav1d_film_grain_dsp_init_arm, Dav1dFilmGrainDSPContext *c);
+bitfn_decls(void dav1d_film_grain_dsp_init_x86, Dav1dFilmGrainDSPContext *c);
+
+#endif /* DAV1D_SRC_FILM_GRAIN_H */
diff --git a/third_party/dav1d/src/filmgrain_tmpl.c b/third_party/dav1d/src/filmgrain_tmpl.c
new file mode 100644
index 0000000000000..883c5cbb7b9a8
--- /dev/null
+++ b/third_party/dav1d/src/filmgrain_tmpl.c
@@ -0,0 +1,433 @@
+/*
+ * Copyright © 2018, Niklas Haas
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/filmgrain.h"
+#include "src/tables.h"
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+static inline int get_random_number(const int bits, unsigned *const state) {
+    const int r = *state;
+    unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
+    *state = (r >> 1) | (bit << 15);
+
+    return (*state >> (16 - bits)) & ((1 << bits) - 1);
+}
+
+static inline int round2(const int x, const uint64_t shift) {
+    return (x + ((1 << shift) >> 1)) >> shift;
+}
+
+static void generate_grain_y_c(entry buf[][GRAIN_WIDTH],
+                               const Dav1dFilmGrainData *const data
+                               HIGHBD_DECL_SUFFIX)
+{
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+    unsigned seed = data->seed;
+    const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
+    const int grain_ctr = 128 << bitdepth_min_8;
+    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+    for (int y = 0; y < GRAIN_HEIGHT; y++) {
+        for (int x = 0; x < GRAIN_WIDTH; x++) {
+            const int value = get_random_number(11, &seed);
+            buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift);
+        }
+    }
+
+    const int ar_pad = 3;
+    const int ar_lag = data->ar_coeff_lag;
+
+    for (int y = ar_pad; y < GRAIN_HEIGHT; y++) {
+        for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) {
+            const int8_t *coeff = data->ar_coeffs_y;
+            int sum = 0;
+            for (int dy = -ar_lag; dy <= 0; dy++) {
+                for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+                    if (!dx && !dy)
+                        break;
+                    sum += *(coeff++) * buf[y + dy][x + dx];
+                }
+            }
+
+            const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+            buf[y][x] = iclip(grain, grain_min, grain_max);
+        }
+    }
+}
+
+static NOINLINE void
+generate_grain_uv_c(entry buf[][GRAIN_WIDTH],
+                    const entry buf_y[][GRAIN_WIDTH],
+                    const Dav1dFilmGrainData *const data, const intptr_t uv,
+                    const int subx, const int suby HIGHBD_DECL_SUFFIX)
+{
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+    unsigned seed = data->seed ^ (uv ? 0x49d8 : 0xb524);
+    const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
+    const int grain_ctr = 128 << bitdepth_min_8;
+    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+    const int chromaW = subx ? SUB_GRAIN_WIDTH  : GRAIN_WIDTH;
+    const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
+
+    for (int y = 0; y < chromaH; y++) {
+        for (int x = 0; x < chromaW; x++) {
+            const int value = get_random_number(11, &seed);
+            buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift);
+        }
+    }
+
+    const int ar_pad = 3;
+    const int ar_lag = data->ar_coeff_lag;
+
+    for (int y = ar_pad; y < chromaH; y++) {
+        for (int x = ar_pad; x < chromaW - ar_pad; x++) {
+            const int8_t *coeff = data->ar_coeffs_uv[uv];
+            int sum = 0;
+            for (int dy = -ar_lag; dy <= 0; dy++) {
+                for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+                    // For the final (current) pixel, we need to add in the
+                    // contribution from the luma grain texture
+                    if (!dx && !dy) {
+                        if (!data->num_y_points)
+                            break;
+                        int luma = 0;
+                        const int lumaX = ((x - ar_pad) << subx) + ar_pad;
+                        const int lumaY = ((y - ar_pad) << suby) + ar_pad;
+                        for (int i = 0; i <= suby; i++) {
+                            for (int j = 0; j <= subx; j++) {
+                                luma += buf_y[lumaY + i][lumaX + j];
+                            }
+                        }
+                        luma = round2(luma, subx + suby);
+                        sum += luma * (*coeff);
+                        break;
+                    }
+
+                    sum += *(coeff++) * buf[y + dy][x + dx];
+                }
+            }
+
+            const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+            buf[y][x] = iclip(grain, grain_min, grain_max);
+        }
+    }
+}
+
+#define gnuv_ss_fn(nm, ss_x, ss_y) \
+static decl_generate_grain_uv_fn(generate_grain_uv_##nm##_c) { \
+    generate_grain_uv_c(buf, buf_y, data, uv, ss_x, ss_y HIGHBD_TAIL_SUFFIX); \
+}
+
+gnuv_ss_fn(420, 1, 1);
+gnuv_ss_fn(422, 1, 0);
+gnuv_ss_fn(444, 0, 0);
+
+// samples from the correct block of a grain LUT, while taking into account the
+// offsets provided by the offsets cache
+static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH],
+                               const int offsets[2][2], const int subx, const int suby,
+                               const int bx, const int by, const int x, const int y)
+{
+    const int randval = offsets[bx][by];
+    const int offx = 3 + (2 >> subx) * (3 + (randval >> 4));
+    const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF));
+    return grain_lut[offy + y + (BLOCK_SIZE >> suby) * by]
+                    [offx + x + (BLOCK_SIZE >> subx) * bx];
+}
+
+static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
+                          const ptrdiff_t stride,
+                          const Dav1dFilmGrainData *const data, const size_t pw,
+                          const uint8_t scaling[SCALING_SIZE],
+                          const entry grain_lut[][GRAIN_WIDTH],
+                          const int bh, const int row_num HIGHBD_DECL_SUFFIX)
+{
+    const int rows = 1 + (data->overlap_flag && row_num > 0);
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+    const int grain_ctr = 128 << bitdepth_min_8;
+    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+    int min_value, max_value;
+    if (data->clip_to_restricted_range) {
+        min_value = 16 << bitdepth_min_8;
+        max_value = 235 << bitdepth_min_8;
+    } else {
+        min_value = 0;
+        max_value = BITDEPTH_MAX;
+    }
+
+    // seed[0] contains the current row, seed[1] contains the previous
+    unsigned seed[2];
+    for (int i = 0; i < rows; i++) {
+        seed[i] = data->seed;
+        seed[i] ^= (((row_num - i) * 37  + 178) & 0xFF) << 8;
+        seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
+    }
+
+    assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
+
+    int offsets[2 /* col offset */][2 /* row offset */];
+
+    // process this row in BLOCK_SIZE^2 blocks
+    for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) {
+        const int bw = imin(BLOCK_SIZE, (int) pw - bx);
+
+        if (data->overlap_flag && bx) {
+            // shift previous offsets left
+            for (int i = 0; i < rows; i++)
+                offsets[1][i] = offsets[0][i];
+        }
+
+        // update current offsets
+        for (int i = 0; i < rows; i++)
+            offsets[0][i] = get_random_number(8, &seed[i]);
+
+        // x/y block offsets to compensate for overlapped regions
+        const int ystart = data->overlap_flag && row_num ? imin(2, bh) : 0;
+        const int xstart = data->overlap_flag && bx      ? imin(2, bw) : 0;
+
+        static const int w[2][2] = { { 27, 17 }, { 17, 27 } };
+
+#define add_noise_y(x, y, grain)                                                  \
+        const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (x) + bx;     \
+        pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (x) + bx;           \
+        const int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \
+        *dst = iclip(*src + noise, min_value, max_value);
+
+        for (int y = ystart; y < bh; y++) {
+            // Non-overlapped image region (straightforward)
+            for (int x = xstart; x < bw; x++) {
+                int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+                add_noise_y(x, y, grain);
+            }
+
+            // Special case for overlapped column
+            for (int x = 0; x < xstart; x++) {
+                int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+                int old   = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y);
+                grain = round2(old * w[x][0] + grain * w[x][1], 5);
+                grain = iclip(grain, grain_min, grain_max);
+                add_noise_y(x, y, grain);
+            }
+        }
+
+        for (int y = 0; y < ystart; y++) {
+            // Special case for overlapped row (sans corner)
+            for (int x = xstart; x < bw; x++) {
+                int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+                int old   = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y);
+                grain = round2(old * w[y][0] + grain * w[y][1], 5);
+                grain = iclip(grain, grain_min, grain_max);
+                add_noise_y(x, y, grain);
+            }
+
+            // Special case for doubly-overlapped corner
+            for (int x = 0; x < xstart; x++) {
+                // Blend the top pixel with the top left block
+                int top = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y);
+                int old = sample_lut(grain_lut, offsets, 0, 0, 1, 1, x, y);
+                top = round2(old * w[x][0] + top * w[x][1], 5);
+                top = iclip(top, grain_min, grain_max);
+
+                // Blend the current pixel with the left block
+                int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+                old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y);
+                grain = round2(old * w[x][0] + grain * w[x][1], 5);
+                grain = iclip(grain, grain_min, grain_max);
+
+                // Mix the two rows together and apply grain
+                grain = round2(top * w[y][0] + grain * w[y][1], 5);
+                grain = iclip(grain, grain_min, grain_max);
+                add_noise_y(x, y, grain);
+            }
+        }
+    }
+}
+
+static NOINLINE void
+fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
+               const ptrdiff_t stride, const Dav1dFilmGrainData *const data,
+               const int pw, const uint8_t scaling[SCALING_SIZE],
+               const entry grain_lut[][GRAIN_WIDTH], const int bh,
+               const int row_num, const pixel *const luma_row,
+               const ptrdiff_t luma_stride, const int uv, const int is_id,
+               const int sx, const int sy HIGHBD_DECL_SUFFIX)
+{
+    const int rows = 1 + (data->overlap_flag && row_num > 0);
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+    const int grain_ctr = 128 << bitdepth_min_8;
+    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+    int min_value, max_value;
+    if (data->clip_to_restricted_range) {
+        min_value = 16 << bitdepth_min_8;
+        max_value = (is_id ? 235 : 240) << bitdepth_min_8;
+    } else {
+        min_value = 0;
+        max_value = BITDEPTH_MAX;
+    }
+
+    // seed[0] contains the current row, seed[1] contains the previous
+    unsigned seed[2];
+    for (int i = 0; i < rows; i++) {
+        seed[i] = data->seed;
+        seed[i] ^= (((row_num - i) * 37  + 178) & 0xFF) << 8;
+        seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
+    }
+
+    assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
+
+    int offsets[2 /* col offset */][2 /* row offset */];
+
+    // process this row in BLOCK_SIZE^2 blocks (subsampled)
+    for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) {
+        const int bw = imin(BLOCK_SIZE >> sx, pw - bx);
+        if (data->overlap_flag && bx) {
+            // shift previous offsets left
+            for (int i = 0; i < rows; i++)
+                offsets[1][i] = offsets[0][i];
+        }
+
+        // update current offsets
+        for (int i = 0; i < rows; i++)
+            offsets[0][i] = get_random_number(8, &seed[i]);
+
+        // x/y block offsets to compensate for overlapped regions
+        const int ystart = data->overlap_flag && row_num ? imin(2 >> sy, bh) : 0;
+        const int xstart = data->overlap_flag && bx      ? imin(2 >> sx, bw) : 0;
+
+        static const int w[2 /* sub */][2 /* off */][2] = {
+            { { 27, 17 }, { 17, 27 } },
+            { { 23, 22 } },
+        };
+
+#define add_noise_uv(x, y, grain)                                                    \
+            const int lx = (bx + x) << sx;                                           \
+            const int ly = y << sy;                                                  \
+            const pixel *const luma = luma_row + ly * PXSTRIDE(luma_stride) + lx;    \
+            pixel avg = luma[0];                                                     \
+            if (sx)                                                                  \
+                avg = (avg + luma[1] + 1) >> 1;                                      \
+            const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (bx + (x));  \
+            pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x));        \
+            int val = avg;                                                           \
+            if (!data->chroma_scaling_from_luma) {                                   \
+                const int combined = avg * data->uv_luma_mult[uv] +                  \
+                               *src * data->uv_mult[uv];                             \
+                val = iclip_pixel( (combined >> 6) +                                 \
+                                   (data->uv_offset[uv] * (1 << bitdepth_min_8)) );  \
+            }                                                                        \
+            const int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \
+            *dst = iclip(*src + noise, min_value, max_value);
+
+        for (int y = ystart; y < bh; y++) {
+            // Non-overlapped image region (straightforward)
+            for (int x = xstart; x < bw; x++) {
+                int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+                add_noise_uv(x, y, grain);
+            }
+
+            // Special case for overlapped column
+            for (int x = 0; x < xstart; x++) {
+                int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+                int old   = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y);
+                grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
+                grain = iclip(grain, grain_min, grain_max);
+                add_noise_uv(x, y, grain);
+            }
+        }
+
+        for (int y = 0; y < ystart; y++) {
+            // Special case for overlapped row (sans corner)
+            for (int x = xstart; x < bw; x++) {
+                int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+                int old   = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y);
+                grain = round2(old * w[sy][y][0] + grain * w[sy][y][1], 5);
+                grain = iclip(grain, grain_min, grain_max);
+                add_noise_uv(x, y, grain);
+            }
+
+            // Special case for doubly-overlapped corner
+            for (int x = 0; x < xstart; x++) {
+                // Blend the top pixel with the top left block
+                int top = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y);
+                int old = sample_lut(grain_lut, offsets, sx, sy, 1, 1, x, y);
+                top = round2(old * w[sx][x][0] + top * w[sx][x][1], 5);
+                top = iclip(top, grain_min, grain_max);
+
+                // Blend the current pixel with the left block
+                int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+                old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y);
+                grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
+                grain = iclip(grain, grain_min, grain_max);
+
+                // Mix the two rows together and apply to image
+                grain = round2(top * w[sy][y][0] + grain * w[sy][y][1], 5);
+                grain = iclip(grain, grain_min, grain_max);
+                add_noise_uv(x, y, grain);
+            }
+        }
+    }
+}
+
+#define fguv_ss_fn(nm, ss_x, ss_y) \
+static decl_fguv_32x32xn_fn(fguv_32x32xn_##nm##_c) { \
+    fguv_32x32xn_c(dst_row, src_row, stride, data, pw, scaling, grain_lut, bh, \
+                   row_num, luma_row, luma_stride, uv_pl, is_id, ss_x, ss_y \
+                   HIGHBD_TAIL_SUFFIX); \
+}
+
+fguv_ss_fn(420, 1, 1);
+fguv_ss_fn(422, 1, 0);
+fguv_ss_fn(444, 0, 0);
+
+COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) {
+    c->generate_grain_y = generate_grain_y_c;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c;
+
+    c->fgy_32x32xn = fgy_32x32xn_c;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+    bitfn(dav1d_film_grain_dsp_init_arm)(c);
+#elif ARCH_X86
+    bitfn(dav1d_film_grain_dsp_init_x86)(c);
+#endif
+#endif
+}
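The C reference film grain code above is driven by a 16-bit LFSR (get_random_number(), taps at bits 0, 1, 3 and 12, matching the AV1 film grain synthesis pseudocode) whose 11-bit outputs index dav1d_gaussian_sequence[]. A stand-alone sketch of the generator, not part of the patch, that just prints the first few draws for an arbitrary seed:

    #include <stdio.h>

    static int get_random_number(const int bits, unsigned *const state) {
        const int r = *state;
        const unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
        *state = (r >> 1) | (bit << 15);
        return (*state >> (16 - bits)) & ((1 << bits) - 1);
    }

    int main(void) {
        unsigned seed = 0x1234;                        /* arbitrary example seed */
        for (int i = 0; i < 8; i++)
            printf("%d ", get_random_number(11, &seed));  /* would index dav1d_gaussian_sequence[] */
        putchar('\n');
        return 0;
    }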
diff --git a/third_party/dav1d/src/internal.h b/third_party/dav1d/src/internal.h
index 8edb80f21ce3e..eceda98eca4da 100644
--- a/third_party/dav1d/src/internal.h
+++ b/third_party/dav1d/src/internal.h
@@ -43,7 +43,7 @@ typedef struct Dav1dTask Dav1dTask;
 #include "src/cdf.h"
 #include "src/data.h"
 #include "src/env.h"
-#include "src/film_grain.h"
+#include "src/filmgrain.h"
 #include "src/intra_edge.h"
 #include "src/ipred.h"
 #include "src/itx.h"
@@ -73,6 +73,22 @@ struct Dav1dTileGroup {
     int start, end;
 };
 
+enum TaskType {
+    DAV1D_TASK_TYPE_INIT,
+    DAV1D_TASK_TYPE_INIT_CDF,
+    DAV1D_TASK_TYPE_TILE_ENTROPY,
+    DAV1D_TASK_TYPE_ENTROPY_PROGRESS,
+    DAV1D_TASK_TYPE_TILE_RECONSTRUCTION,
+    DAV1D_TASK_TYPE_DEBLOCK_COLS,
+    DAV1D_TASK_TYPE_DEBLOCK_ROWS,
+    DAV1D_TASK_TYPE_CDEF,
+    DAV1D_TASK_TYPE_SUPER_RESOLUTION,
+    DAV1D_TASK_TYPE_LOOP_RESTORATION,
+    DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS,
+    DAV1D_TASK_TYPE_FG_PREP,
+    DAV1D_TASK_TYPE_FG_APPLY,
+};
+
 struct Dav1dContext {
     Dav1dFrameContext *fc;
     unsigned n_fc;
@@ -123,6 +139,24 @@ struct Dav1dContext {
         // See src/thread_task.c:reset_task_cur().
         atomic_uint reset_task_cur;
         atomic_int cond_signaled;
+        struct {
+            int exec;
+            pthread_cond_t cond;
+            const Dav1dPicture *in;
+            Dav1dPicture *out;
+            enum TaskType type;
+            atomic_int progress[2]; /* [0]=started, [1]=completed */
+            union {
+                struct {
+                    ALIGN(int8_t grain_lut_8bpc[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH], 16);
+                    ALIGN(uint8_t scaling_8bpc[3][256], 64);
+                };
+                struct {
+                    ALIGN(int16_t grain_lut_16bpc[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH], 16);
+                    ALIGN(uint8_t scaling_16bpc[3][4096], 64);
+                };
+            };
+        } delayed_fg;
         int inited;
     } task_thread;
 
@@ -155,6 +189,7 @@ struct Dav1dContext {
     int operating_point;
     unsigned operating_point_idc;
     int all_layers;
+    int max_spatial_id;
     unsigned frame_size_limit;
     int strict_std_compliance;
     int output_invisible_frames;
@@ -162,26 +197,14 @@ struct Dav1dContext {
     int drain;
     enum PictureFlags frame_flags;
     enum Dav1dEventFlags event_flags;
+    Dav1dDataProps cached_error_props;
+    int cached_error;
 
     Dav1dLogger logger;
 
     Dav1dMemPool *picture_pool;
 };
 
-enum TaskType {
-    DAV1D_TASK_TYPE_INIT,
-    DAV1D_TASK_TYPE_INIT_CDF,
-    DAV1D_TASK_TYPE_TILE_ENTROPY,
-    DAV1D_TASK_TYPE_ENTROPY_PROGRESS,
-    DAV1D_TASK_TYPE_TILE_RECONSTRUCTION,
-    DAV1D_TASK_TYPE_DEBLOCK_COLS,
-    DAV1D_TASK_TYPE_DEBLOCK_ROWS,
-    DAV1D_TASK_TYPE_CDEF,
-    DAV1D_TASK_TYPE_SUPER_RESOLUTION,
-    DAV1D_TASK_TYPE_LOOP_RESTORATION,
-    DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS,
-};
-
 struct Dav1dTask {
     unsigned frame_idx;         // frame thread id
     enum TaskType type;         // task work
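The new delayed_fg block in internal.h keeps per-context scratch buffers for the threaded film grain path; the 8 bpc and 16 bpc variants are never needed for the same delayed frame, which is presumably why they share storage through the anonymous union. A small sketch, not part of the patch, of the footprint being overlaid (GRAIN_WIDTH/GRAIN_HEIGHT as defined in filmgrain.h above):

    #include <stdint.h>
    #include <stdio.h>

    #define GRAIN_WIDTH  82
    #define GRAIN_HEIGHT 73

    int main(void) {
        const size_t lut8  = sizeof(int8_t [3][GRAIN_HEIGHT + 1][GRAIN_WIDTH]);
        const size_t lut16 = sizeof(int16_t[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH]);
        printf("grain_lut: %zu bytes (8 bpc) vs %zu bytes (16 bpc)\n", lut8, lut16);
        printf("scaling:   %d bytes (8 bpc) vs %d bytes (16 bpc)\n",
               3 * 256, 3 * 4096);
        return 0;
    }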
diff --git a/third_party/dav1d/src/lib.c b/third_party/dav1d/src/lib.c
index 6b50a536786d2..b21a735964f23 100644
--- a/third_party/dav1d/src/lib.c
+++ b/third_party/dav1d/src/lib.c
@@ -120,7 +120,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
 
     pthread_attr_setstacksize(&thread_attr, stack_size);
 
-    Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 32);
+    Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 64);
     if (!c) goto error;
     memset(c, 0, sizeof(*c));
 
@@ -134,6 +134,8 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
     c->output_invisible_frames = s->output_invisible_frames;
     c->inloop_filters = s->inloop_filters;
 
+    dav1d_data_props_set_defaults(&c->cached_error_props);
+
     if (dav1d_mem_pool_init(&c->seq_hdr_pool) ||
         dav1d_mem_pool_init(&c->frame_hdr_pool) ||
         dav1d_mem_pool_init(&c->segmap_pool) ||
@@ -197,6 +199,11 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
             pthread_mutex_destroy(&c->task_thread.lock);
             goto error;
         }
+        if (pthread_cond_init(&c->task_thread.delayed_fg.cond, NULL)) {
+            pthread_cond_destroy(&c->task_thread.cond);
+            pthread_mutex_destroy(&c->task_thread.lock);
+            goto error;
+        }
         c->task_thread.cur = c->n_fc;
         atomic_init(&c->task_thread.reset_task_cur, UINT_MAX);
         atomic_init(&c->task_thread.cond_signaled, 0);
@@ -317,7 +324,8 @@ static int output_image(Dav1dContext *const c, Dav1dPicture *const out)
 {
     int res = 0;
 
-    Dav1dThreadPicture *const in = c->all_layers ? &c->out : &c->cache;
+    Dav1dThreadPicture *const in = (c->all_layers || !c->max_spatial_id)
+                                   ? &c->out : &c->cache;
     if (!c->apply_grain || !has_grain(&in->p)) {
         dav1d_picture_move_ref(out, &in->p);
         dav1d_thread_picture_unref(in);
@@ -327,18 +335,17 @@ static int output_image(Dav1dContext *const c, Dav1dPicture *const out)
     res = dav1d_apply_grain(c, out, &in->p);
     dav1d_thread_picture_unref(in);
 end:
-    if (!c->all_layers && c->out.p.data[0]) {
+    if (!c->all_layers && c->max_spatial_id && c->out.p.data[0]) {
         dav1d_thread_picture_move_ref(in, &c->out);
     }
     return res;
 }
 
 static int output_picture_ready(Dav1dContext *const c, const int drain) {
-    if (!c->all_layers) {
+    if (c->cached_error) return 1;
+    if (!c->all_layers && c->max_spatial_id) {
         if (c->out.p.data[0] && c->cache.p.data[0]) {
-            const unsigned spatial_mask = c->operating_point_idc >> 8;
-            const int max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0;
-            if (max_spatial_id == c->cache.p.frame_hdr->spatial_id ||
+            if (c->max_spatial_id == c->cache.p.frame_hdr->spatial_id ||
                 c->out.flags & PICTURE_FLAG_NEW_TEMPORAL_UNIT)
                 return 1;
             dav1d_thread_picture_unref(&c->cache);
@@ -377,6 +384,13 @@ static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
         if (++c->frame_thread.next == c->n_fc)
             c->frame_thread.next = 0;
         pthread_mutex_unlock(&c->task_thread.lock);
+        const int error = f->task_thread.retval;
+        if (error) {
+            f->task_thread.retval = 0;
+            dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m);
+            dav1d_thread_picture_unref(out_delayed);
+            return error;
+        }
         if (out_delayed->p.data[0]) {
             const unsigned progress =
                 atomic_load_explicit(&out_delayed->progress[1],
@@ -457,6 +471,12 @@ int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out)
     if (res < 0)
         return res;
 
+    if (c->cached_error) {
+        const int res = c->cached_error;
+        c->cached_error = 0;
+        return res;
+    }
+
     if (output_picture_ready(c, c->n_fc == 1))
         return output_image(c, out);
 
@@ -479,33 +499,43 @@ int dav1d_apply_grain(Dav1dContext *const c, Dav1dPicture *const out,
     }
 
     int res = dav1d_picture_alloc_copy(c, out, in->p.w, in);
-    if (res < 0) {
-        dav1d_picture_unref_internal(out);
-        return res;
-    }
+    if (res < 0) goto error;
 
-    switch (out->p.bpc) {
+    if (c->n_tc > 1) {
+        dav1d_task_delayed_fg(c, out, in);
+    } else {
+        switch (out->p.bpc) {
 #if CONFIG_8BPC
-    case 8:
-        dav1d_apply_grain_8bpc(&c->dsp[0].fg, out, in);
-        break;
+        case 8:
+            dav1d_apply_grain_8bpc(&c->dsp[0].fg, out, in);
+            break;
 #endif
 #if CONFIG_16BPC
-    case 10:
-    case 12:
-        dav1d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in);
-        break;
+        case 10:
+        case 12:
+            dav1d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in);
+            break;
 #endif
-    default:
-        assert(0);
+        default: abort();
+        }
     }
 
     return 0;
+
+error:
+    dav1d_picture_unref_internal(out);
+    return res;
 }
 
 void dav1d_flush(Dav1dContext *const c) {
     dav1d_data_unref_internal(&c->in);
+    if (c->out.p.data[0])
+        dav1d_thread_picture_unref(&c->out);
+    if (c->cache.p.data[0])
+        dav1d_thread_picture_unref(&c->cache);
+
     c->drain = 0;
+    c->cached_error = 0;
 
     for (int i = 0; i < 8; i++) {
         if (c->refs[i].p.p.data[0])
@@ -525,6 +555,8 @@ void dav1d_flush(Dav1dContext *const c) {
     dav1d_ref_dec(&c->content_light_ref);
     dav1d_ref_dec(&c->itut_t35_ref);
 
+    dav1d_data_props_unref_internal(&c->cached_error_props);
+
     if (c->n_fc == 1 && c->n_tc == 1) return;
     atomic_store(c->flush, 1);
 
@@ -556,6 +588,7 @@ void dav1d_flush(Dav1dContext *const c) {
             Dav1dFrameContext *const f = &c->fc[next];
             dav1d_decode_frame_exit(f, -1);
             f->n_tile_data = 0;
+            f->task_thread.retval = 0;
             Dav1dThreadPicture *out_delayed = &c->frame_thread.out_delayed[next];
             if (out_delayed->p.data[0]) {
                 dav1d_thread_picture_unref(out_delayed);
@@ -592,6 +625,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
                 pthread_cond_destroy(&pf->task_thread.td.cond);
                 pthread_mutex_destroy(&pf->task_thread.td.lock);
             }
+            pthread_cond_destroy(&ttd->delayed_fg.cond);
             pthread_cond_destroy(&ttd->cond);
             pthread_mutex_destroy(&ttd->lock);
         }
@@ -631,7 +665,6 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
         dav1d_free_aligned(f->lf.lr_line_buf);
     }
     dav1d_free_aligned(c->fc);
-    dav1d_data_unref_internal(&c->in);
     if (c->n_fc > 1 && c->frame_thread.out_delayed) {
         for (unsigned n = 0; n < c->n_fc; n++)
             if (c->frame_thread.out_delayed[n].p.data[0])
@@ -674,6 +707,17 @@ int dav1d_get_event_flags(Dav1dContext *const c, enum Dav1dEventFlags *const fla
     return 0;
 }
 
+int dav1d_get_decode_error_data_props(Dav1dContext *const c, Dav1dDataProps *const out) {
+    validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
+
+    dav1d_data_props_unref_internal(out);
+    *out = c->cached_error_props;
+    dav1d_data_props_set_defaults(&c->cached_error_props);
+
+    return 0;
+}
+
 void dav1d_picture_unref(Dav1dPicture *const p) {
     dav1d_picture_unref_internal(p);
 }
@@ -706,3 +750,7 @@ int dav1d_data_wrap_user_data(Dav1dData *const buf,
 void dav1d_data_unref(Dav1dData *const buf) {
     dav1d_data_unref_internal(buf);
 }
+
+void dav1d_data_props_unref(Dav1dDataProps *const props) {
+    dav1d_data_props_unref_internal(props);
+}
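Together with the cached_error plumbing in decode.c, the new public entry points above let an application find out which input packet a (possibly frame-delayed) decode error belongs to. A hypothetical caller is sketched below rather than taken from the patch; fetch_picture is an invented helper name, and DAV1D_ERR(EAGAIN) is dav1d's usual "no picture ready yet" return:

    #include <errno.h>
    #include <stdio.h>

    #include <dav1d/dav1d.h>

    static int fetch_picture(Dav1dContext *const c, Dav1dPicture *const pic) {
        const int ret = dav1d_get_picture(c, pic);
        if (ret < 0 && ret != DAV1D_ERR(EAGAIN)) {
            Dav1dDataProps props = { 0 };  /* zero-init: the call first unrefs *out */
            if (!dav1d_get_decode_error_data_props(c, &props)) {
                fprintf(stderr, "decode error %d for input with timestamp %lld\n",
                        ret, (long long)props.timestamp);
                dav1d_data_props_unref(&props);  /* release the transferred ref */
            }
        }
        return ret;
    }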
diff --git a/third_party/dav1d/src/meson.build b/third_party/dav1d/src/meson.build
index 9095f0ba97a94..b06aee6d70d84 100644
--- a/third_party/dav1d/src/meson.build
+++ b/third_party/dav1d/src/meson.build
@@ -58,7 +58,7 @@ libdav1d_tmpl_sources = files(
     'cdef_apply_tmpl.c',
     'cdef_tmpl.c',
     'fg_apply_tmpl.c',
-    'film_grain_tmpl.c',
+    'filmgrain_tmpl.c',
     'ipred_prepare_tmpl.c',
     'ipred_tmpl.c',
     'itx_tmpl.c',
@@ -96,7 +96,7 @@ if is_asm_enabled
         )
         libdav1d_tmpl_sources += files(
             'arm/cdef_init_tmpl.c',
-            'arm/film_grain_init_tmpl.c',
+            'arm/filmgrain_init_tmpl.c',
             'arm/ipred_init_tmpl.c',
             'arm/itx_init_tmpl.c',
             'arm/loopfilter_init_tmpl.c',
@@ -116,7 +116,7 @@ if is_asm_enabled
             if dav1d_bitdepths.contains('8')
                 libdav1d_sources_asm += files(
                     'arm/64/cdef.S',
-                    'arm/64/film_grain.S',
+                    'arm/64/filmgrain.S',
                     'arm/64/ipred.S',
                     'arm/64/loopfilter.S',
                     'arm/64/looprestoration.S',
@@ -127,7 +127,7 @@ if is_asm_enabled
             if dav1d_bitdepths.contains('16')
                 libdav1d_sources_asm += files(
                     'arm/64/cdef16.S',
-                    'arm/64/film_grain16.S',
+                    'arm/64/filmgrain16.S',
                     'arm/64/ipred16.S',
                     'arm/64/itx16.S',
                     'arm/64/loopfilter16.S',
@@ -147,7 +147,7 @@ if is_asm_enabled
             if dav1d_bitdepths.contains('8')
                 libdav1d_sources_asm += files(
                     'arm/32/cdef.S',
-                    'arm/32/film_grain.S',
+                    'arm/32/filmgrain.S',
                     'arm/32/ipred.S',
                     'arm/32/loopfilter.S',
                     'arm/32/looprestoration.S',
@@ -158,7 +158,7 @@ if is_asm_enabled
             if dav1d_bitdepths.contains('16')
                 libdav1d_sources_asm += files(
                     'arm/32/cdef16.S',
-                    'arm/32/film_grain16.S',
+                    'arm/32/filmgrain16.S',
                     'arm/32/ipred16.S',
                     'arm/32/itx16.S',
                     'arm/32/loopfilter16.S',
@@ -183,7 +183,7 @@ if is_asm_enabled
 
         libdav1d_tmpl_sources += files(
             'x86/cdef_init_tmpl.c',
-            'x86/film_grain_init_tmpl.c',
+            'x86/filmgrain_init_tmpl.c',
             'x86/ipred_init_tmpl.c',
             'x86/itx_init_tmpl.c',
             'x86/loopfilter_init_tmpl.c',
@@ -206,16 +206,17 @@ if is_asm_enabled
         if dav1d_bitdepths.contains('8')
             libdav1d_sources_asm += files(
                 'x86/cdef_avx512.asm',
+                'x86/filmgrain_avx512.asm',
                 'x86/ipred_avx512.asm',
                 'x86/itx_avx512.asm',
                 'x86/loopfilter_avx512.asm',
                 'x86/looprestoration_avx512.asm',
                 'x86/mc_avx512.asm',
-                'x86/mc_avx2.asm',
-                'x86/film_grain_avx2.asm',
+                'x86/filmgrain_avx2.asm',
                 'x86/ipred_avx2.asm',
                 'x86/loopfilter_avx2.asm',
-                'x86/film_grain_sse.asm',
+                'x86/mc_avx2.asm',
+                'x86/filmgrain_sse.asm',
                 'x86/ipred_sse.asm',
                 'x86/loopfilter_sse.asm',
                 'x86/looprestoration_sse.asm',
@@ -225,17 +226,18 @@ if is_asm_enabled
 
         if dav1d_bitdepths.contains('16')
             libdav1d_sources_asm += files(
+                'x86/ipred16_avx512.asm',
                 'x86/looprestoration16_avx512.asm',
                 'x86/mc16_avx512.asm',
                 'x86/cdef16_avx2.asm',
-                'x86/film_grain16_avx2.asm',
+                'x86/filmgrain16_avx2.asm',
                 'x86/ipred16_avx2.asm',
                 'x86/itx16_avx2.asm',
                 'x86/loopfilter16_avx2.asm',
                 'x86/looprestoration16_avx2.asm',
                 'x86/mc16_avx2.asm',
                 'x86/cdef16_sse.asm',
-                'x86/film_grain16_sse.asm',
+                'x86/filmgrain16_sse.asm',
                 'x86/ipred16_sse.asm',
                 'x86/itx16_sse.asm',
                 'x86/loopfilter16_sse.asm',
diff --git a/third_party/dav1d/src/obu.c b/third_party/dav1d/src/obu.c
index dee6e13de913d..7df6850a8c392 100644
--- a/third_party/dav1d/src/obu.c
+++ b/third_party/dav1d/src/obu.c
@@ -135,15 +135,18 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
                 op->initial_display_delay = dav1d_get_bits(gb, 4) + 1;
             }
         }
-        const int op_idx =
-            c->operating_point < hdr->num_operating_points ? c->operating_point : 0;
-        c->operating_point_idc = hdr->operating_points[op_idx].idc;
 #if DEBUG_SEQ_HDR
         printf("SEQHDR: post-operating-points: off=%u\n",
                dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
     }
 
+    const int op_idx =
+        c->operating_point < hdr->num_operating_points ? c->operating_point : 0;
+    c->operating_point_idc = hdr->operating_points[op_idx].idc;
+    const unsigned spatial_mask = c->operating_point_idc >> 8;
+    c->max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0;
+
     hdr->width_n_bits = dav1d_get_bits(gb, 4) + 1;
     hdr->height_n_bits = dav1d_get_bits(gb, 4) + 1;
     hdr->max_width = dav1d_get_bits(gb, hdr->width_n_bits) + 1;
@@ -383,7 +386,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
         if (seqhdr->frame_id_numbers_present) {
             hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits);
             Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->existing_frame_idx].p.p.frame_hdr;
-            if (!ref_frame_hdr || ref_frame_hdr->frame_id != hdr->frame_id) return DAV1D_ERR(EINVAL);
+            if (!ref_frame_hdr || ref_frame_hdr->frame_id != hdr->frame_id) goto error;
         }
         return 0;
     }
@@ -767,7 +770,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
             // segmentation data from the reference frame.
             assert(hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
             const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
-            if (!c->refs[pri_ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL);
+            if (!c->refs[pri_ref].p.p.frame_hdr) goto error;
             hdr->segmentation.seg_data =
                 c->refs[pri_ref].p.p.frame_hdr->segmentation.seg_data;
         }
@@ -829,7 +832,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
             hdr->loopfilter.mode_ref_deltas = default_mode_ref_deltas;
         } else {
             const int ref = hdr->refidx[hdr->primary_ref_frame];
-            if (!c->refs[ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL);
+            if (!c->refs[ref].p.p.frame_hdr) goto error;
             hdr->loopfilter.mode_ref_deltas =
                 c->refs[ref].p.p.frame_hdr->loopfilter.mode_ref_deltas;
         }
@@ -932,7 +935,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
         int off_after = -1;
         int off_before_idx, off_after_idx;
         for (int i = 0; i < 7; i++) {
-            if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL);
+            if (!c->refs[hdr->refidx[i]].p.p.data[0]) goto error;
             const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
 
             const int diff = get_poc_diff(seqhdr->order_hint_n_bits, refpoc, poc);
@@ -960,7 +963,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
             unsigned off_before2 = 0xFFFFFFFFU;
             int off_before2_idx;
             for (int i = 0; i < 7; i++) {
-                if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL);
+                if (!c->refs[hdr->refidx[i]].p.p.data[0]) goto error;
                 const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
                 if (get_poc_diff(seqhdr->order_hint_n_bits,
                                  refpoc, off_before) < 0) {
@@ -1015,7 +1018,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
                 ref_gmv = &dav1d_default_wm_params;
             } else {
                 const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
-                if (!c->refs[pri_ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL);
+                if (!c->refs[pri_ref].p.p.frame_hdr) goto error;
                 ref_gmv = &c->refs[pri_ref].p.p.frame_hdr->gmv[i];
             }
             int32_t *const mat = hdr->gmv[i].matrix;
@@ -1245,11 +1248,11 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
         memset(seq_hdr, 0, sizeof(*seq_hdr));
         if ((res = parse_seq_hdr(c, &gb, seq_hdr)) < 0) {
             dav1d_ref_dec(&ref);
-            return res;
+            goto error;
         }
         if (check_for_overrun(c, &gb, init_bit_pos, len)) {
             dav1d_ref_dec(&ref);
-            return DAV1D_ERR(EINVAL);
+            goto error;
         }
         // If we have read a sequence header which is different from
         // the old one, this is a new video sequence and can't use any
@@ -1307,7 +1310,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
         c->frame_hdr->spatial_id = spatial_id;
         if ((res = parse_frame_hdr(c, &gb)) < 0) {
             c->frame_hdr = NULL;
-            return res;
+            goto error;
         }
         for (int n = 0; n < c->n_tile_data; n++)
             dav1d_data_unref_internal(&c->tile[n].data);
@@ -1319,7 +1322,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
             dav1d_get_bits(&gb, 1);
             if (check_for_overrun(c, &gb, init_bit_pos, len)) {
                 c->frame_hdr = NULL;
-                return DAV1D_ERR(EINVAL);
+                goto error;
             }
         }
 
@@ -1360,7 +1363,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
         // Align to the next byte boundary and check for overrun.
         dav1d_bytealign_get_bits(&gb);
         if (check_for_overrun(c, &gb, init_bit_pos, len))
-            return DAV1D_ERR(EINVAL);
+            goto error;
         // The current bit position is a multiple of 8 (because we
         // just aligned it) and less than 8*pkt_bytelen because
         // otherwise the overrun check would have fired.
@@ -1547,7 +1550,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
 
     if (c->seq_hdr && c->frame_hdr) {
         if (c->frame_hdr->show_existing_frame) {
-            if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) return DAV1D_ERR(EINVAL);
+            if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) goto error;
             if (c->n_fc == 1) {
                 dav1d_thread_picture_ref(&c->out,
                                          &c->refs[c->frame_hdr->existing_frame_idx].p);
@@ -1574,7 +1577,13 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
                     if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
                         c->task_thread.cur--;
                 }
-                if (out_delayed->p.data[0]) {
+                const int error = f->task_thread.retval;
+                if (error) {
+                    c->cached_error = error;
+                    f->task_thread.retval = 0;
+                    dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m);
+                    dav1d_thread_picture_unref(out_delayed);
+                } else if (out_delayed->p.data[0]) {
                     const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
                                                                    memory_order_relaxed);
                     if ((out_delayed->visible || c->output_invisible_frames) &&
@@ -1613,7 +1622,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
             c->frame_hdr = NULL;
         } else if (c->n_tiles == c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows) {
             if (!c->n_tile_data)
-                return DAV1D_ERR(EINVAL);
+                goto error;
             if ((res = dav1d_submit_frame(c)) < 0)
                 return res;
             assert(!c->n_tile_data);
@@ -1625,6 +1634,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
     return len + init_byte_pos;
 
 error:
+    dav1d_data_props_copy(&c->cached_error_props, &in->m);
     dav1d_log(c, "Error parsing OBU data\n");
     return DAV1D_ERR(EINVAL);
 }
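
[Illustrative note, not part of the patch] The relocated operating-point block in the obu.c hunk above now also derives c->max_spatial_id from the upper bits of operating_point_idc (in AV1, bits 0..7 of the idc carry the temporal-layer mask and bits 8..11 the spatial-layer mask). A minimal sketch of that derivation, where ulog2() is only a stand-in for dav1d's highest-set-bit helper:

    #include <stdio.h>

    /* stand-in for dav1d's ulog2(): index of the highest set bit (v > 0) */
    static int ulog2(unsigned v) {
        int n = 0;
        while (v >>= 1) n++;
        return n;
    }

    int main(void) {
        /* idc 0x301: temporal layer 0 enabled, spatial layers 0 and 1 enabled */
        const unsigned operating_point_idc = 0x301;
        const unsigned spatial_mask = operating_point_idc >> 8;
        const int max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0;
        printf("max_spatial_id = %d\n", max_spatial_id); /* prints 1 */
        return 0;
    }

With idc 0x301 the spatial mask is 0x3, so the highest selectable spatial layer id is 1, matching what the patched parse_seq_hdr() stores.
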
diff --git a/third_party/dav1d/src/picture.c b/third_party/dav1d/src/picture.c
index 461c9d0522b37..bebc4dd9c17b1 100644
--- a/third_party/dav1d/src/picture.c
+++ b/third_party/dav1d/src/picture.c
@@ -283,6 +283,7 @@ void dav1d_picture_unref_internal(Dav1dPicture *const p) {
         dav1d_ref_dec(&p->itut_t35_ref);
     }
     memset(p, 0, sizeof(*p));
+    dav1d_data_props_set_defaults(&p->m);
 }
 
 void dav1d_thread_picture_unref(Dav1dThreadPicture *const p) {
diff --git a/third_party/dav1d/src/tables.c b/third_party/dav1d/src/tables.c
index 92b8c4b5a8dcc..9752f15c40df2 100644
--- a/third_party/dav1d/src/tables.c
+++ b/third_party/dav1d/src/tables.c
@@ -756,7 +756,7 @@ const uint16_t dav1d_dr_intra_derivative[44] = {
     [1*idx+32] = f4, [1*idx+40] = f5,      \
     [1*idx+48] = f6
 #endif
-const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = {
+const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 64) = {
     {
         F( 0,  -6, 10,  0,  0,  0, 12,  0 ),
         F( 1,  -5,  2, 10,  0,  0,  9,  0 ),
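
[Illustrative note, not part of the patch] The tables.c change above widens dav1d_filter_intra_taps alignment from 16 to 64 bytes, presumably so the new AVX-512 code can use full-width aligned loads. A small hedged illustration of the same requirement in C11 (example_taps is an illustrative name, not a dav1d symbol):

    #include <stdalign.h>
    #include <stdint.h>
    #include <stdio.h>

    /* 64-byte alignment permits aligned 512-bit (ZMM) loads of each row;
     * 16-byte alignment only guarantees XMM-width aligned access. */
    static const alignas(64) int8_t example_taps[5][64] = {{0}};

    int main(void) {
        printf("aligned to 64: %d\n", (int)(((uintptr_t)example_taps & 63) == 0));
        return 0;
    }
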
diff --git a/third_party/dav1d/src/thread_task.c b/third_party/dav1d/src/thread_task.c
index bcda3182894a7..53aa41e5c8ad7 100644
--- a/third_party/dav1d/src/thread_task.c
+++ b/third_party/dav1d/src/thread_task.c
@@ -30,6 +30,7 @@
 #include "common/frame.h"
 
 #include "src/thread_task.h"
+#include "src/fg_apply.h"
 
 // This function resets the cur pointer to the first frame theoretically
 // executable after a task completed (ie. each time we update some progress or
@@ -281,6 +282,22 @@ void dav1d_task_frame_init(Dav1dFrameContext *const f) {
     insert_task(f, t, 1);
 }
 
+void dav1d_task_delayed_fg(Dav1dContext *const c, Dav1dPicture *const out,
+                           const Dav1dPicture *const in)
+{
+    struct TaskThreadData *const ttd = &c->task_thread;
+    ttd->delayed_fg.in = in;
+    ttd->delayed_fg.out = out;
+    ttd->delayed_fg.type = DAV1D_TASK_TYPE_FG_PREP;
+    atomic_init(&ttd->delayed_fg.progress[0], 0);
+    atomic_init(&ttd->delayed_fg.progress[1], 0);
+    pthread_mutex_lock(&ttd->lock);
+    ttd->delayed_fg.exec = 1;
+    pthread_cond_signal(&ttd->cond);
+    pthread_cond_wait(&ttd->delayed_fg.cond, &ttd->lock);
+    pthread_mutex_unlock(&ttd->lock);
+}
+
 static inline int ensure_progress(struct TaskThreadData *const ttd,
                                   Dav1dFrameContext *const f,
                                   Dav1dTask *const t, const enum TaskType type,
@@ -352,18 +369,104 @@ static inline int check_tile(Dav1dTask *const t, Dav1dFrameContext *const f,
     return 0;
 }
 
-static inline void abort_frame(Dav1dFrameContext *const f) {
-    atomic_store(&f->task_thread.error, 1);
+static inline void abort_frame(Dav1dFrameContext *const f, const int error) {
+    atomic_store(&f->task_thread.error, error == DAV1D_ERR(EINVAL) ? 1 : -1);
     f->task_thread.task_counter = 0;
     f->task_thread.done[0] = 1;
     f->task_thread.done[1] = 1;
     atomic_store(&f->sr_cur.progress[0], FRAME_ERROR);
     atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
-    dav1d_decode_frame_exit(f, -1);
+    dav1d_decode_frame_exit(f, error);
     f->n_tile_data = 0;
     pthread_cond_signal(&f->task_thread.cond);
 }
 
+static inline void delayed_fg_task(const Dav1dContext *const c,
+                                   struct TaskThreadData *const ttd)
+{
+    const Dav1dPicture *const in = ttd->delayed_fg.in;
+    Dav1dPicture *const out = ttd->delayed_fg.out;
+#if CONFIG_16BPC
+    int off;
+    if (out->p.bpc != 8)
+        off = (out->p.bpc >> 1) - 4;
+#endif
+    switch (ttd->delayed_fg.type) {
+    case DAV1D_TASK_TYPE_FG_PREP:
+        ttd->delayed_fg.exec = 0;
+        if (atomic_load(&ttd->cond_signaled))
+            pthread_cond_signal(&ttd->cond);
+        pthread_mutex_unlock(&ttd->lock);
+        switch (out->p.bpc) {
+#if CONFIG_8BPC
+        case 8:
+            dav1d_prep_grain_8bpc(&c->dsp[0].fg, out, in,
+                                  ttd->delayed_fg.scaling_8bpc,
+                                  ttd->delayed_fg.grain_lut_8bpc);
+            break;
+#endif
+#if CONFIG_16BPC
+        case 10:
+        case 12:
+            dav1d_prep_grain_16bpc(&c->dsp[off].fg, out, in,
+                                   ttd->delayed_fg.scaling_16bpc,
+                                   ttd->delayed_fg.grain_lut_16bpc);
+            break;
+#endif
+        default: abort();
+        }
+        ttd->delayed_fg.type = DAV1D_TASK_TYPE_FG_APPLY;
+        pthread_mutex_lock(&ttd->lock);
+        ttd->delayed_fg.exec = 1;
+        // fall-through
+    case DAV1D_TASK_TYPE_FG_APPLY:;
+        int row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
+        pthread_mutex_unlock(&ttd->lock);
+        int progmax = (out->p.h + 31) >> 5;
+    fg_apply_loop:
+        if (row + 1 < progmax)
+            pthread_cond_signal(&ttd->cond);
+        else if (row + 1 >= progmax) {
+            pthread_mutex_lock(&ttd->lock);
+            ttd->delayed_fg.exec = 0;
+            if (row >= progmax) goto end_add;
+            pthread_mutex_unlock(&ttd->lock);
+        }
+        switch (out->p.bpc) {
+#if CONFIG_8BPC
+        case 8:
+            dav1d_apply_grain_row_8bpc(&c->dsp[0].fg, out, in,
+                                       ttd->delayed_fg.scaling_8bpc,
+                                       ttd->delayed_fg.grain_lut_8bpc, row);
+            break;
+#endif
+#if CONFIG_16BPC
+        case 10:
+        case 12:
+            dav1d_apply_grain_row_16bpc(&c->dsp[off].fg, out, in,
+                                        ttd->delayed_fg.scaling_16bpc,
+                                        ttd->delayed_fg.grain_lut_16bpc, row);
+            break;
+#endif
+        default: abort();
+        }
+        row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
+        int done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1;
+        if (row < progmax) goto fg_apply_loop;
+        pthread_mutex_lock(&ttd->lock);
+        ttd->delayed_fg.exec = 0;
+    end_add:
+        done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1;
+        progmax = atomic_load(&ttd->delayed_fg.progress[0]);
+        // signal for completion only once the last runner reaches this
+        if (done < progmax)
+            break;
+        pthread_cond_signal(&ttd->delayed_fg.cond);
+        break;
+    default: abort();
+    }
+}
+
 void *dav1d_worker_task(void *data) {
     Dav1dTaskContext *const tc = data;
     const Dav1dContext *const c = tc->c;
@@ -373,11 +476,15 @@ void *dav1d_worker_task(void *data) {
 
     pthread_mutex_lock(&ttd->lock);
     for (;;) {
-        Dav1dFrameContext *f;
-        Dav1dTask *t, *prev_t = NULL;
         if (tc->task_thread.die) break;
         if (atomic_load(c->flush)) goto park;
-        if (c->n_fc > 1) { // run init tasks first
+        if (ttd->delayed_fg.exec) { // run delayed film grain first
+            delayed_fg_task(c, ttd);
+            continue;
+        }
+        Dav1dFrameContext *f;
+        Dav1dTask *t, *prev_t = NULL;
+        if (c->n_fc > 1) { // run init tasks second
             for (unsigned i = 0; i < c->n_fc; i++) {
                 const unsigned first = atomic_load(&ttd->first);
                 f = &c->fc[(first + i) % c->n_fc];
@@ -395,7 +502,7 @@ void *dav1d_worker_task(void *data) {
                 }
             }
         }
-        while (ttd->cur < c->n_fc) {
+        while (ttd->cur < c->n_fc) { // run decoding tasks last
             const unsigned first = atomic_load(&ttd->first);
             f = &c->fc[(first + ttd->cur) % c->n_fc];
             prev_t = f->task_thread.task_cur_prev;
@@ -497,7 +604,7 @@ void *dav1d_worker_task(void *data) {
             int p1 = f->in_cdf.progress ? atomic_load(f->in_cdf.progress) : 1;
             if (res || p1 == TILE_ERROR) {
                 pthread_mutex_lock(&ttd->lock);
-                abort_frame(f);
+                abort_frame(f, res ? res : DAV1D_ERR(EINVAL));
             } else if (!res) {
                 t->type = DAV1D_TASK_TYPE_INIT_CDF;
                 if (p1) goto found_unlocked;
@@ -509,7 +616,7 @@ void *dav1d_worker_task(void *data) {
         }
         case DAV1D_TASK_TYPE_INIT_CDF: {
             assert(c->n_fc > 1);
-            int res = -1;
+            int res = DAV1D_ERR(EINVAL);
             if (!atomic_load(&f->task_thread.error))
                 res = dav1d_decode_frame_init_cdf(f);
             pthread_mutex_lock(&ttd->lock);
@@ -523,19 +630,19 @@ void *dav1d_worker_task(void *data) {
                     if (res) {
                         // memory allocation failed
                         f->task_thread.done[2 - p] = 1;
-                        atomic_store(&f->task_thread.error, 1);
+                        atomic_store(&f->task_thread.error, -1);
                         f->task_thread.task_counter -= f->sbh +
                             f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
                         atomic_store(&f->sr_cur.progress[p - 1], FRAME_ERROR);
                         if (p == 2 && f->task_thread.done[1]) {
                             assert(!f->task_thread.task_counter);
-                            dav1d_decode_frame_exit(f, -1);
+                            dav1d_decode_frame_exit(f, DAV1D_ERR(ENOMEM));
                             f->n_tile_data = 0;
                             pthread_cond_signal(&f->task_thread.cond);
                         }
                     }
                 }
-            } else abort_frame(f);
+            } else abort_frame(f, res);
             reset_task_cur(c, ttd, t->frame_idx);
             f->task_thread.init_done = 1;
             continue;
@@ -588,7 +695,8 @@ void *dav1d_worker_task(void *data) {
                 if (!--f->task_thread.task_counter && f->task_thread.done[0] &&
                     (!uses_2pass || f->task_thread.done[1]))
                 {
-                    dav1d_decode_frame_exit(f, error ? -1 : 0);
+                    dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
+                                            error ? DAV1D_ERR(ENOMEM) : 0);
                     f->n_tile_data = 0;
                     pthread_cond_signal(&f->task_thread.cond);
                 }
@@ -703,7 +811,8 @@ void *dav1d_worker_task(void *data) {
         if (!--f->task_thread.task_counter &&
             f->task_thread.done[0] && (!uses_2pass || f->task_thread.done[1]))
         {
-            dav1d_decode_frame_exit(f, error ? -1 : 0);
+            dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
+                                    error ? DAV1D_ERR(ENOMEM) : 0);
             f->n_tile_data = 0;
             pthread_cond_signal(&f->task_thread.cond);
         }
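
[Illustrative note, not part of the patch] The thread_task.c hunks above add a delayed film-grain path: dav1d_task_delayed_fg() publishes the job under the task-thread lock, wakes the worker pool, and blocks on a dedicated condition variable until the last worker to finish a grain row signals completion (workers claim rows through an atomic progress counter). A minimal sketch of that hand-off pattern under simplified assumptions, with illustrative names only, not dav1d code:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    #define NUM_WORKERS 4
    #define NUM_ROWS    32

    static pthread_mutex_t lock      = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  work_cond = PTHREAD_COND_INITIALIZER;
    static pthread_cond_t  done_cond = PTHREAD_COND_INITIALIZER;
    static int job_pending;      /* protected by lock */
    static atomic_int next_row;  /* rows claimed so far */
    static atomic_int rows_done; /* rows completed so far */

    static void apply_grain_row(int row) { (void)row; /* per-row work */ }

    static void *worker(void *arg) {
        (void)arg;
        pthread_mutex_lock(&lock);
        while (!job_pending)                 /* wait for the job to be published */
            pthread_cond_wait(&work_cond, &lock);
        pthread_mutex_unlock(&lock);
        for (;;) {
            const int row = atomic_fetch_add(&next_row, 1); /* claim a row */
            if (row >= NUM_ROWS) break;
            apply_grain_row(row);
            if (atomic_fetch_add(&rows_done, 1) + 1 == NUM_ROWS) {
                pthread_mutex_lock(&lock);
                pthread_cond_signal(&done_cond); /* last runner reports back */
                pthread_mutex_unlock(&lock);
            }
        }
        return NULL;
    }

    int main(void) {
        pthread_t th[NUM_WORKERS];
        for (int i = 0; i < NUM_WORKERS; i++)
            pthread_create(&th[i], NULL, worker, NULL);
        pthread_mutex_lock(&lock);
        job_pending = 1;                      /* publish the delayed job */
        pthread_cond_broadcast(&work_cond);
        while (atomic_load(&rows_done) < NUM_ROWS)
            pthread_cond_wait(&done_cond, &lock); /* block until the last row */
        pthread_mutex_unlock(&lock);
        for (int i = 0; i < NUM_WORKERS; i++)
            pthread_join(th[i], NULL);
        printf("all %d rows done\n", NUM_ROWS);
        return 0;
    }

The real code additionally distinguishes a one-shot FG_PREP step from the per-row FG_APPLY step and reuses the pool's shared lock and progress counters, but the signal-then-wait shape of the caller and the last-runner completion signal are the same idea.
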
diff --git a/third_party/dav1d/src/thread_task.h b/third_party/dav1d/src/thread_task.h
index 0ff2228bed7c5..257da1a470c70 100644
--- a/third_party/dav1d/src/thread_task.h
+++ b/third_party/dav1d/src/thread_task.h
@@ -39,6 +39,8 @@
 int dav1d_task_create_tile_sbrow(Dav1dFrameContext *f, int pass, int cond_signal);
 void dav1d_task_frame_init(Dav1dFrameContext *f);
 
+void dav1d_task_delayed_fg(Dav1dContext *c, Dav1dPicture *out, const Dav1dPicture *in);
+
 void *dav1d_worker_task(void *data);
 
 int dav1d_decode_frame_init(Dav1dFrameContext *f);
diff --git a/third_party/dav1d/src/x86/cdef16_avx2.asm b/third_party/dav1d/src/x86/cdef16_avx2.asm
index 9e2c7b361e0b1..4c8d3bca4377b 100644
--- a/third_party/dav1d/src/x86/cdef16_avx2.asm
+++ b/third_party/dav1d/src/x86/cdef16_avx2.asm
@@ -59,14 +59,6 @@ cextern cdef_dir_8bpc_avx2.main
 
 SECTION .text
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 %macro CDEF_FILTER 2 ; w, h
     DEFINE_ARGS dst, stride, _, dir, pridmp, pri, sec, tmp
     movifnidn     prid, r5m
diff --git a/third_party/dav1d/src/x86/cdef16_sse.asm b/third_party/dav1d/src/x86/cdef16_sse.asm
index 03736b422c0b9..1bd67ace64d09 100644
--- a/third_party/dav1d/src/x86/cdef16_sse.asm
+++ b/third_party/dav1d/src/x86/cdef16_sse.asm
@@ -64,14 +64,6 @@ cextern shufw_6543210x
 
 SECTION .text
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 %if ARCH_X86_32
 DECLARE_REG_TMP 5, 3
 %elif WIN64
diff --git a/third_party/dav1d/src/x86/film_grain16_avx2.asm b/third_party/dav1d/src/x86/filmgrain16_avx2.asm
similarity index 98%
rename from third_party/dav1d/src/x86/film_grain16_avx2.asm
rename to third_party/dav1d/src/x86/filmgrain16_avx2.asm
index 1ece90b68433b..5c2e3868a349a 100644
--- a/third_party/dav1d/src/x86/film_grain16_avx2.asm
+++ b/third_party/dav1d/src/x86/filmgrain16_avx2.asm
@@ -25,6 +25,7 @@
 
 %include "config.asm"
 %include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
 
 %if ARCH_X86_64
 
@@ -81,38 +82,8 @@ JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3
 JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3
 JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3
 
-struc FGData
-    .seed:                      resd 1
-    .num_y_points:              resd 1
-    .y_points:                  resb 14 * 2
-    .chroma_scaling_from_luma:  resd 1
-    .num_uv_points:             resd 2
-    .uv_points:                 resb 2 * 10 * 2
-    .scaling_shift:             resd 1
-    .ar_coeff_lag:              resd 1
-    .ar_coeffs_y:               resb 24
-    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
-    .ar_coeff_shift:            resq 1
-    .grain_scale_shift:         resd 1
-    .uv_mult:                   resd 2
-    .uv_luma_mult:              resd 2
-    .uv_offset:                 resd 2
-    .overlap_flag:              resd 1
-    .clip_to_restricted_range:  resd 1
-endstruc
-
-cextern gaussian_sequence
-
 SECTION .text
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
 
 INIT_YMM avx2
diff --git a/third_party/dav1d/src/x86/film_grain16_sse.asm b/third_party/dav1d/src/x86/filmgrain16_sse.asm
similarity index 99%
rename from third_party/dav1d/src/x86/film_grain16_sse.asm
rename to third_party/dav1d/src/x86/filmgrain16_sse.asm
index 3f86e7d9a5907..6b0daaac0ba35 100644
--- a/third_party/dav1d/src/x86/film_grain16_sse.asm
+++ b/third_party/dav1d/src/x86/filmgrain16_sse.asm
@@ -25,6 +25,7 @@
 
 %include "config.asm"
 %include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
 
 SECTION_RODATA 16
 pd_16: times 4 dd 16
@@ -66,38 +67,8 @@ JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3
 JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3
 JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3
 
-struc FGData
-    .seed:                      resd 1
-    .num_y_points:              resd 1
-    .y_points:                  resb 14 * 2
-    .chroma_scaling_from_luma:  resd 1
-    .num_uv_points:             resd 2
-    .uv_points:                 resb 2 * 10 * 2
-    .scaling_shift:             resd 1
-    .ar_coeff_lag:              resd 1
-    .ar_coeffs_y:               resb 24
-    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
-    .ar_coeff_shift:            resq 1
-    .grain_scale_shift:         resd 1
-    .uv_mult:                   resd 2
-    .uv_luma_mult:              resd 2
-    .uv_offset:                 resd 2
-    .overlap_flag:              resd 1
-    .clip_to_restricted_range:  resd 1
-endstruc
-
-cextern gaussian_sequence
-
 SECTION .text
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 %if ARCH_X86_32
 %undef base
 %define PIC_ptr(a) base+a
diff --git a/third_party/dav1d/src/x86/film_grain_avx2.asm b/third_party/dav1d/src/x86/filmgrain_avx2.asm
similarity index 98%
rename from third_party/dav1d/src/x86/film_grain_avx2.asm
rename to third_party/dav1d/src/x86/filmgrain_avx2.asm
index dda43a9baa579..7da8105dfacd0 100644
--- a/third_party/dav1d/src/x86/film_grain_avx2.asm
+++ b/third_party/dav1d/src/x86/filmgrain_avx2.asm
@@ -25,6 +25,7 @@
 
 %include "config.asm"
 %include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
 
 %if ARCH_X86_64
 
@@ -74,38 +75,8 @@ JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3
 JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3
 JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3
 
-struc FGData
-    .seed:                      resd 1
-    .num_y_points:              resd 1
-    .y_points:                  resb 14 * 2
-    .chroma_scaling_from_luma:  resd 1
-    .num_uv_points:             resd 2
-    .uv_points:                 resb 2 * 10 * 2
-    .scaling_shift:             resd 1
-    .ar_coeff_lag:              resd 1
-    .ar_coeffs_y:               resb 24
-    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
-    .ar_coeff_shift:            resq 1
-    .grain_scale_shift:         resd 1
-    .uv_mult:                   resd 2
-    .uv_luma_mult:              resd 2
-    .uv_offset:                 resd 2
-    .overlap_flag:              resd 1
-    .clip_to_restricted_range:  resd 1
-endstruc
-
-cextern gaussian_sequence
-
 SECTION .text
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 INIT_YMM avx2
 cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data
 %define base r4-generate_grain_y_8bpc_avx2_table
@@ -1097,9 +1068,6 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \
     jne .loop_x_hv_overlap
     jmp .loop_x_h_overlap
 
-.end:
-    RET
-
 .vertical_overlap:
     DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                 unused, sby, see, overlap
@@ -1214,7 +1182,7 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \
     jmp .loop_y
 .end_y_v_overlap:
     add              wq, 32
-    jge .end_hv
+    jge .end
     lea            srcq, [src_bakq+wq]
 
     ; since fg_dataq.overlap is guaranteed to be set, we never jump
@@ -1334,7 +1302,7 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \
     add              wq, 32
     lea            srcq, [src_bakq+wq]
     jl .loop_x_hv_overlap
-.end_hv:
+.end:
     RET
 
 %macro FGUV_FN 3 ; name, ss_hor, ss_ver
@@ -1691,9 +1659,6 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
     jne %%loop_x_hv_overlap
     jmp %%loop_x_h_overlap
 
-%%end:
-    RET
-
 %%vertical_overlap:
     DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
                 sby, see, overlap, unused1, unused2, lstride
@@ -1887,7 +1852,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
 
 %%end_y_v_overlap:
     add              wq, 32>>%2
-    jge %%end_hv
+    jge %%end
     mov            srcq, r11mp
     mov            dstq, r12mp
     lea           lumaq, [r14+wq*(1+%2)]
@@ -2116,15 +2081,14 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
 
 %%end_y_hv_overlap:
     add              wq, 32>>%2
-    jge %%end_hv
+    jge %%end
     mov            srcq, r11mp
     mov            dstq, r12mp
     lea           lumaq, [r14+wq*(1+%2)]
     add            srcq, wq
     add            dstq, wq
     jmp %%loop_x_hv_overlap
-
-%%end_hv:
+%%end:
     RET
 %endmacro
 
diff --git a/third_party/dav1d/src/x86/filmgrain_avx512.asm b/third_party/dav1d/src/x86/filmgrain_avx512.asm
new file mode 100644
index 0000000000000..6d277464a5976
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain_avx512.asm
@@ -0,0 +1,1079 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+pb_even:       db  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+               db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+               db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
+               db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
+pb_odd:        db  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+               db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+               db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
+               db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
+interleave_hl: db  8,  0,  9,  1, 10,  2, 11,  3, 12,  4, 13,  5, 14,  6, 15,  7
+pb_27_17_17_27:        db 27, 17, 17, 27,  0, 32,  0, 32
+pb_23_22_0_32:         db 23, 22,  0, 32,  0, 32,  0, 32
+pb_27_17:      times 2 db 27, 17
+pb_23_22:      times 2 db 23, 22
+pw_8:          times 2 dw 8
+pw_1024:       times 2 dw 1024
+pb_17_27:      times 2 db 17, 27
+fg_max:        times 4 db 255
+               times 4 db 240
+               times 4 db 235
+fg_min:        times 4 db 0
+               times 4 db 16
+noise_rnd:     times 2 dw 128
+               times 2 dw 64
+               times 2 dw 32
+               times 2 dw 16
+
+SECTION .text
+
+INIT_ZMM avx512icl
+cglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \
+                                     grain_lut, h, sby, see, overlap
+%define base r11-fg_min
+    lea             r11, [fg_min]
+    mov             r6d, [fg_dataq+FGData.scaling_shift]
+    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
+    mov            sbyd, sbym
+    mov        overlapd, [fg_dataq+FGData.overlap_flag]
+    mov             r12, 0x0000000f0000000f ; h_overlap mask
+    mova             m0, [scalingq+64*0]
+    mova             m1, [scalingq+64*1]
+    mova             m2, [scalingq+64*2]
+    mova             m3, [scalingq+64*3]
+    kmovq            k1, r12
+    vbroadcasti32x4  m4, [base+interleave_hl]
+    vpbroadcastd   ym16, [base+pb_27_17]
+    vpbroadcastd    m12, [base+pb_17_27]
+    vpbroadcastd     m6, [base+noise_rnd+r6*4-32]
+    test           sbyd, sbyd
+    setnz           r6b
+    vpbroadcastd     m7, [base+fg_min+r7*4]
+    vpbroadcastd     m8, [base+fg_max+r7*8]
+    pxor             m5, m5
+    vpbroadcastd     m9, [base+pw_1024]
+    vpbroadcastq    m10, [base+pb_27_17_17_27]
+    vmovdqa64   m12{k1}, m16
+    test            r6b, overlapb
+    jnz .v_overlap
+
+    imul           seed, sbyd, (173 << 24) | 37
+    add            seed, (105 << 24) | 178
+    rorx           seed, seed, 24
+    movzx          seed, seew
+    xor            seed, [fg_dataq+FGData.seed]
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+                h, sby, see, overlap
+
+    lea        src_bakq, [srcq+wq]
+    neg              wq
+    sub            dstq, srcq
+.loop_x:
+    rorx             r6, seeq, 1
+    or             seed, 0xeff4
+    test           seeb, seeh
+    lea            seed, [r6+0x8000]
+    cmovp          seed, r6d                 ; updated seed
+    rorx          offyd, seed, 8
+    rorx          offxq, seeq, 12
+    and           offyd, 0xf
+    imul          offyd, 164
+    lea           offxd, [offyq+offxq*2+829] ; offy*stride+offx
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+                h, sby, see, overlap
+
+    mov      grain_lutq, grain_lutmp
+    mov              hd, hm
+.loop_y:
+    mova           ym18, [srcq+strideq*0]
+    vinserti32x8    m18, [srcq+strideq*1], 1
+    movu           ym21, [grain_lutq+offxyq-82]
+    vinserti32x8    m21, [grain_lutq+offxyq+ 0], 1
+    mova            m19, m0
+    vpmovb2m         k2, m18
+    punpcklbw       m16, m18, m5
+    punpckhbw       m17, m18, m5
+    vpermt2b        m19, m18, m1 ; scaling[  0..127]
+    vpermi2b        m18, m2, m3  ; scaling[128..255]
+    punpcklbw       m20, m5, m21 ; grain
+    punpckhbw       m21, m5
+    vmovdqu8    m19{k2}, m18     ; scaling[src]
+    pshufb          m19, m4
+    pmaddubsw       m18, m19, m20
+    pmaddubsw       m19, m21
+    add      grain_lutq, 82*2
+    pmulhrsw        m18, m6      ; noise
+    pmulhrsw        m19, m6
+    paddw           m16, m18
+    paddw           m17, m19
+    packuswb        m16, m17
+    pmaxub          m16, m7
+    pminub          m16, m8
+    mova    [dstq+srcq], ym16
+    add            srcq, strideq
+    vextracti32x8 [dstq+srcq], m16, 1
+    add            srcq, strideq
+    sub              hb, 2
+    jg .loop_y
+    add              wq, 32
+    jge .end
+    lea            srcq, [src_bakq+wq]
+    test       overlapd, overlapd
+    jz .loop_x
+    test           sbyd, sbyd
+    jnz .hv_overlap
+
+.loop_x_h_overlap:
+    rorx             r6, seeq, 1
+    or             seed, 0xeff4
+    test           seeb, seeh
+    lea            seed, [r6+0x8000]
+    cmovp          seed, r6d                 ; updated seed
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+                h, sby, see, left_offxy
+
+    rorx          offyd, seed, 8
+    mov     left_offxyd, offxd               ; previous column's offy*stride
+    rorx          offxq, seeq, 12
+    and           offyd, 0xf
+    imul          offyd, 164
+    lea           offxd, [offyq+offxq*2+829] ; offy*stride+offx
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+                h, sby, see, left_offxy
+
+    mov      grain_lutq, grain_lutmp
+    mov              hd, hm
+.loop_y_h_overlap:
+    movu           ym20, [grain_lutq+offxyq-82]
+    vinserti32x8    m20, [grain_lutq+offxyq+ 0], 1
+    movd           xm21, [grain_lutq+left_offxyq-50]
+    vinserti32x4    m21, [grain_lutq+left_offxyq+32], 2
+    mova           ym18, [srcq+strideq*0]
+    vinserti32x8    m18, [srcq+strideq*1], 1
+    mova            m19, m0
+    punpcklbw       m21, m20
+    vpmovb2m         k2, m18
+    punpcklbw       m16, m18, m5
+    punpckhbw       m17, m18, m5
+    pmaddubsw       m21, m10, m21
+    vpermt2b        m19, m18, m1
+    vpermi2b        m18, m2, m3
+    pmulhrsw        m21, m9
+    vmovdqu8    m19{k2}, m18     ; scaling[src]
+    punpckhbw       m18, m20, m5
+    pshufb          m19, m4
+    packsswb    m20{k1}, m21, m21
+    punpcklbw       m20, m5, m20 ; grain
+    pmaddubsw       m18, m19, m18
+    pmaddubsw       m19, m20
+    add      grain_lutq, 82*2
+    pmulhrsw        m18, m6      ; noise
+    pmulhrsw        m19, m6
+    paddw           m17, m18
+    paddw           m16, m19
+    packuswb        m16, m17
+    pmaxub          m16, m7
+    pminub          m16, m8
+    mova    [dstq+srcq], ym16
+    add            srcq, strideq
+    vextracti32x8 [dstq+srcq], m16, 1
+    add            srcq, strideq
+    sub              hb, 2
+    jg .loop_y_h_overlap
+    add              wq, 32
+    jge .end
+    lea            srcq, [src_bakq+wq]
+    test           sbyd, sbyd
+    jnz .hv_overlap
+    jmp .loop_x_h_overlap
+
+.v_overlap:
+    DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \
+                h, sby, see, overlap
+
+    movzx           r6d, sbyb
+    imul           seed, [fg_dataq+FGData.seed], 0x00010001
+    imul            r7d, r6d, 173 * 0x00010001
+    imul            r6d, 37 * 0x01000100
+    add             r7d, (105 << 16) | 188
+    add             r6d, (178 << 24) | (141 << 8)
+    and             r7d, 0x00ff00ff
+    and             r6d, 0xff00ff00
+    xor            seed, r7d
+    xor            seed, r6d     ; (cur_seed << 16) | top_seed
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+                h, sby, see, overlap
+
+    lea        src_bakq, [srcq+wq]
+    neg              wq
+    sub            dstq, srcq
+
+    ; we assume from the block above that bits 8-15 of r7d are zero'ed
+    mov             r6d, seed
+    or             seed, 0xeff4eff4
+    test           seeb, seeh
+    setp            r7b          ; parity of top_seed
+    shr            seed, 16
+    shl             r7d, 16
+    test           seeb, seeh
+    setp            r7b          ; parity of cur_seed
+    or              r6d, 0x00010001
+    xor             r7d, r6d
+    rorx           seed, r7d, 1  ; updated (cur_seed << 16) | top_seed
+    rorx          offyd, seed, 8
+    rorx          offxd, seed, 12
+    and           offyd, 0xf000f
+    and           offxd, 0xf000f
+    imul          offyd, 164
+    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+    lea           offxd, [offyq+offxq*2+0x10001*829+32*82]
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+                h, sby, see, overlap, top_offxy
+
+    mov      grain_lutq, grain_lutmp
+    mov              hd, hm
+    movzx    top_offxyd, offxyw
+    shr          offxyd, 16
+    movu           ym16, [grain_lutq+offxyq-82]
+    vinserti32x8    m16, [grain_lutq+offxyq+ 0], 1
+    movu           ym21, [grain_lutq+top_offxyq-82]
+    vinserti32x8    m21, [grain_lutq+top_offxyq+ 0], 1
+    mova           ym18, [srcq+strideq*0]
+    vinserti32x8    m18, [srcq+strideq*1], 1
+    mova            m19, m0
+    punpcklbw       m20, m21, m16
+    punpckhbw       m21, m16
+    vpmovb2m         k2, m18
+    pmaddubsw       m20, m12, m20
+    pmaddubsw       m21, m12, m21
+    punpcklbw       m16, m18, m5
+    punpckhbw       m17, m18, m5
+    vpermt2b        m19, m18, m1
+    vpermi2b        m18, m2, m3
+    pmulhrsw        m20, m9
+    pmulhrsw        m21, m9
+    vmovdqu8    m19{k2}, m18     ; scaling[src]
+    pshufb          m19, m4
+    packsswb        m20, m21
+    punpcklbw       m18, m5, m20 ; grain
+    punpckhbw       m20, m5
+    pmaddubsw       m18, m19, m18
+    pmaddubsw       m19, m20
+    add      grain_lutq, 82*2
+    pmulhrsw        m18, m6      ; noise
+    pmulhrsw        m19, m6
+    paddw           m16, m18
+    paddw           m17, m19
+    packuswb        m16, m17
+    pmaxub          m16, m7
+    pminub          m16, m8
+    mova    [dstq+srcq], ym16
+    add            srcq, strideq
+    vextracti32x8 [dstq+srcq], m16, 1
+    add            srcq, strideq
+    sub              hb, 2
+    jg .loop_y
+    add              wq, 32
+    jge .end
+    lea            srcq, [src_bakq+wq]
+
+    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
+    ; to .v_overlap, and instead always fall-through to h+v overlap
+.hv_overlap:
+    ; we assume from the block above that bits 8-15 of r7d are zero'ed
+    mov             r6d, seed
+    or             seed, 0xeff4eff4
+    test           seeb, seeh
+    setp            r7b          ; parity of top_seed
+    shr            seed, 16
+    shl             r7d, 16
+    test           seeb, seeh
+    setp            r7b          ; parity of cur_seed
+    or              r6d, 0x00010001
+    xor             r7d, r6d
+    rorx           seed, r7d, 1  ; updated (cur_seed << 16) | top_seed
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+                h, sby, see, left_offxy, top_offxy, topleft_offxy
+
+    mov  topleft_offxyd, top_offxyd
+    rorx          offyd, seed, 8
+    mov     left_offxyd, offxd
+    rorx          offxd, seed, 12
+    and           offyd, 0xf000f
+    and           offxd, 0xf000f
+    imul          offyd, 164
+    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+    lea           offxd, [offyq+offxq*2+0x10001*829+32*82]
+
+    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+                h, sby, see, left_offxy, top_offxy, topleft_offxy
+
+    mov      grain_lutq, grain_lutmp
+    mov              hd, hm
+    movzx    top_offxyd, offxyw
+    shr          offxyd, 16
+    movu           ym19, [grain_lutq+offxyq-82]
+    vinserti32x8    m19, [grain_lutq+offxyq+ 0], 1
+    movd           xm16, [grain_lutq+left_offxyq-50]
+    vinserti32x4    m16, [grain_lutq+left_offxyq+32], 2
+    movu           ym21, [grain_lutq+top_offxyq-82]
+    vinserti32x8    m21, [grain_lutq+top_offxyq+ 0], 1
+    movd           xm17, [grain_lutq+topleft_offxyq-50]
+    vinserti32x4    m17, [grain_lutq+topleft_offxyq+32], 2
+    mova           ym18, [srcq+strideq*0]
+    vinserti32x8    m18, [srcq+strideq*1], 1
+    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+    punpcklbw       m16, m19
+    punpcklbw       m17, m21
+    pmaddubsw       m16, m10, m16
+    pmaddubsw       m17, m10, m17
+    punpckhbw       m20, m21, m19
+    vpmovb2m         k2, m18
+    pmulhrsw        m16, m9
+    pmulhrsw        m17, m9
+    packsswb    m19{k1}, m16, m16
+    packsswb    m21{k1}, m17, m17
+    ; followed by v interpolation (top | cur -> cur)
+    punpcklbw       m21, m19
+    mova            m19, m0
+    pmaddubsw       m20, m12, m20
+    pmaddubsw       m21, m12, m21
+    punpcklbw       m16, m18, m5
+    punpckhbw       m17, m18, m5
+    vpermt2b        m19, m18, m1 ; scaling[  0..127]
+    vpermi2b        m18, m2, m3  ; scaling[128..255]
+    pmulhrsw        m20, m9
+    pmulhrsw        m21, m9
+    vmovdqu8    m19{k2}, m18     ; scaling[src]
+    pshufb          m19, m4
+    packsswb        m21, m20
+    punpcklbw       m20, m5, m21
+    punpckhbw       m21, m5
+    pmaddubsw       m18, m19, m20
+    pmaddubsw       m19, m21
+    add      grain_lutq, 82*2
+    pmulhrsw        m18, m6      ; noise
+    pmulhrsw        m19, m6
+    paddw           m16, m18
+    paddw           m17, m19
+    packuswb        m16, m17
+    pmaxub          m16, m7
+    pminub          m16, m8
+    mova    [dstq+srcq], ym16
+    add            srcq, strideq
+    vextracti32x8 [dstq+srcq], m16, 1
+    add            srcq, strideq
+    sub              hb, 2
+    jg .loop_y_h_overlap
+    add              wq, 32
+    lea            srcq, [src_bakq+wq]
+    jl .hv_overlap
+.end:
+    RET
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+cglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \
+                                             scaling, grain_lut, h, sby, luma, \
+                                             overlap, uv_pl, is_id, _, stride3
+    lea             r11, [fg_min]
+    mov             r6d, [fg_dataq+FGData.scaling_shift]
+    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
+    mov             r9d, is_idm
+    mov            sbyd, sbym
+    mov        overlapd, [fg_dataq+FGData.overlap_flag]
+%if %2
+    mov             r12, 0x000f000f000f000f ; h_overlap mask
+    vpbroadcastq    m10, [base+pb_23_22_0_32]
+    lea        stride3q, [strideq*3]
+%else
+    mov             r12, 0x0000000f0000000f
+    vpbroadcastq    m10, [base+pb_27_17_17_27]
+%endif
+    mova             m0, [scalingq+64*0]
+    mova             m1, [scalingq+64*1]
+    mova             m2, [scalingq+64*2]
+    mova             m3, [scalingq+64*3]
+    kmovq            k1, r12
+    vbroadcasti32x4  m4, [base+interleave_hl]
+    vpbroadcastd     m6, [base+noise_rnd+r6*4-32]
+    vpbroadcastd     m7, [base+fg_min+r7*4]
+    shlx            r7d, r7d, r9d
+    vpbroadcastd     m8, [base+fg_max+r7*4]
+    test           sbyd, sbyd
+    setnz           r7b
+    vpbroadcastd     m9, [base+pw_1024]
+    mova            m11, [base+pb_even]
+    mova            m12, [base+pb_odd]
+    pxor             m5, m5
+    mov              r5, r10mp      ; lstride
+    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+    jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+    DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
+                h, sby, see, overlap, uv_pl, _, _, stride3
+%if %1
+    mov             r6d, uv_plm
+    vpbroadcastd    m16, [base+pw_8]
+    vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4]
+    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r6*4]
+    pshufb          m14, m16     ; uv_luma_mult, uv_mult
+%endif
+    test            r7b, overlapb
+    jnz %%v_overlap
+
+    imul           seed, sbyd, (173 << 24) | 37
+    add            seed, (105 << 24) | 178
+    rorx           seed, seed, 24
+    movzx          seed, seew
+    xor            seed, [fg_dataq+FGData.seed]
+
+    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+                offx, offy, see, overlap, _, _, _, stride3
+
+    mov           lumaq, r9mp
+    lea             r11, [srcq+wq]
+    lea             r12, [dstq+wq]
+    lea             r13, [lumaq+wq*(1+%2)]
+    mov           r11mp, r11
+    mov           r12mp, r12
+    neg              wq
+
+%%loop_x:
+    rorx             r6, seeq, 1
+    or             seed, 0xeff4
+    test           seeb, seeh
+    lea            seed, [r6+0x8000]
+    cmovp          seed, r6d     ; updated seed
+    rorx          offyd, seed, 8
+    rorx          offxq, seeq, 12
+    and           offyd, 0xf
+    imul          offyd, 164>>%3
+    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+                h, offxy, see, overlap, _, _, _, stride3
+
+    mov      grain_lutq, grain_lutmp
+    mov              hd, hm
+%%loop_y:
+    mova           ym18, [lumaq+lstrideq*(0<<%3)]
+    vinserti32x8    m18, [lumaq+lstrideq*(1<<%3)], 1
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+%if %2
+    mova           ym20, [lumaq+lstrideq*(0<<%3)]
+    vinserti32x8    m20, [lumaq+lstrideq*(1<<%3)], 1
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+    mova           xm17, [srcq+strideq*0]
+    movu           xm21, [grain_lutq+offxyq+82*0]
+    vinserti128    ym17, [srcq+strideq*1], 1
+    vinserti128    ym21, [grain_lutq+offxyq+82*1], 1
+    mova            m19, m11
+    vpermi2b        m19, m18, m20
+    vpermt2b        m18, m12, m20
+    vinserti32x4    m17, [srcq+strideq*2], 2
+    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
+    pavgb           m18, m19
+    vinserti32x4    m17, [srcq+stride3q ], 3
+    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
+%else
+    mova           ym17, [srcq+strideq*0]
+    vinserti32x8    m17, [srcq+strideq*1], 1
+    movu           ym21, [grain_lutq+offxyq+82*0]
+    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
+%endif
+    lea            srcq, [srcq+strideq*(2<<%2)]
+%if %1
+    punpckhbw       m19, m18, m17
+    punpcklbw       m18, m17     ; { luma, chroma }
+    pmaddubsw       m19, m14
+    pmaddubsw       m18, m14
+    psraw           m19, 6
+    psraw           m18, 6
+    paddw           m19, m15
+    paddw           m18, m15
+    packuswb        m18, m19
+%endif
+    mova            m19, m0
+    vpmovb2m         k2, m18
+    vpermt2b        m19, m18, m1 ; scaling[  0..127]
+    vpermi2b        m18, m2, m3  ; scaling[128..255]
+    punpcklbw       m20, m5, m21 ; grain
+    punpckhbw       m21, m5
+    vmovdqu8    m19{k2}, m18     ; scaling[src]
+    pshufb          m19, m4
+    pmaddubsw       m18, m19, m20
+    pmaddubsw       m19, m21
+    add      grain_lutq, 82*2<<%2
+    pmulhrsw        m18, m6      ; noise
+    pmulhrsw        m19, m6
+    punpcklbw       m16, m17, m5 ; chroma
+    punpckhbw       m17, m5
+    paddw           m16, m18
+    paddw           m17, m19
+    packuswb        m16, m17
+    pmaxub          m16, m7
+    pminub          m16, m8
+%if %2
+    mova          [dstq+strideq*0], xm16
+    vextracti128  [dstq+strideq*1], ym16, 1
+    vextracti32x4 [dstq+strideq*2], m16, 2
+    vextracti32x4 [dstq+stride3q ], m16, 3
+%else
+    mova          [dstq+strideq*0], ym16
+    vextracti32x8 [dstq+strideq*1], m16, 1
+%endif
+    lea            dstq, [dstq+strideq*(2<<%2)]
+    sub              hb, 2<<%2
+    jg %%loop_y
+    add              wq, 32>>%2
+    jge %%end
+    mov            srcq, r11mp
+    mov            dstq, r12mp
+    lea           lumaq, [r13+wq*(1<<%2)]
+    add            srcq, wq
+    add            dstq, wq
+    test       overlapd, overlapd
+    jz %%loop_x
+    cmp       dword r8m, 0       ; sby
+    jne %%hv_overlap
+
+    ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+    rorx             r6, seeq, 1
+    or             seed, 0xeff4
+    test           seeb, seeh
+    lea            seed, [r6+0x8000]
+    cmovp          seed, r6d     ; updated seed
+
+    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+                offx, offy, see, left_offxy, _, _, _, stride3
+
+    lea     left_offxyd, [offyq+(32>>%2)]         ; previous column's offy*stride+offx
+    rorx          offyd, seed, 8
+    rorx          offxq, seeq, 12
+    and           offyd, 0xf
+    imul          offyd, 164>>%3
+    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+                h, offxy, see, left_offxy, _, _, _, stride3
+
+    mov      grain_lutq, grain_lutmp
+    mov              hd, hm
+%%loop_y_h_overlap:
+    ; src
+%if %2
+    mova           ym18, [lumaq+lstrideq*(0<<%3)]
+    vinserti32x8    m18, [lumaq+lstrideq*(1<<%3)], 1
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+    mova           ym20, [lumaq+lstrideq*(0<<%3)]
+    vinserti32x8    m20, [lumaq+lstrideq*(1<<%3)], 1
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+    mova           xm17, [srcq+strideq*0]
+    vinserti128    ym17, [srcq+strideq*1], 1
+    mova            m19, m11
+    vpermi2b        m19, m18, m20
+    vpermt2b        m18, m12, m20
+    vinserti32x4    m17, [srcq+strideq*2], 2
+    pavgb           m18, m19
+    vinserti32x4    m17, [srcq+stride3q ], 3
+%else
+    mova           ym18, [lumaq+lstrideq*0]
+    vinserti32x8    m18, [lumaq+lstrideq*1], 1
+    mova           ym17, [srcq+strideq*0]
+    vinserti32x8    m17, [srcq+strideq*1], 1
+    lea           lumaq, [lumaq+lstrideq*2]
+%endif
+    lea            srcq, [srcq+strideq*(2<<%2)]
+%if %1
+    punpckhbw       m19, m18, m17
+    punpcklbw       m18, m17     ; { luma, chroma }
+    pmaddubsw       m19, m14
+    pmaddubsw       m18, m14
+    psraw           m19, 6
+    psraw           m18, 6
+    paddw           m19, m15
+    paddw           m18, m15
+    packuswb        m18, m19
+%endif
+    mova            m19, m0
+    vpmovb2m         k2, m18
+    vpermt2b        m19, m18, m1 ; scaling[  0..127]
+    vpermi2b        m18, m2, m3  ; scaling[128..255]
+    vmovdqu8    m19{k2}, m18     ; scaling[src]
+%if %2
+    movu           xm20, [grain_lutq+offxyq     +82*0]
+    movd           xm18, [grain_lutq+left_offxyq+82*0]
+    vinserti32x4   ym20, [grain_lutq+offxyq     +82*1], 1
+    vinserti32x4   ym18, [grain_lutq+left_offxyq+82*1], 1
+    vinserti32x4    m20, [grain_lutq+offxyq     +82*2], 2
+    vinserti32x4    m18, [grain_lutq+left_offxyq+82*2], 2
+    vinserti32x4    m20, [grain_lutq+offxyq     +82*3], 3
+    vinserti32x4    m18, [grain_lutq+left_offxyq+82*3], 3
+%else
+    movu           ym20, [grain_lutq+offxyq     + 0]
+    movd           xm18, [grain_lutq+left_offxyq+ 0]
+    vinserti32x8    m20, [grain_lutq+offxyq     +82], 1
+    vinserti32x4    m18, [grain_lutq+left_offxyq+82], 2
+%endif
+    punpcklbw       m18, m20
+    pmaddubsw       m18, m10, m18
+    punpckhbw       m21, m20, m5
+    pshufb          m19, m4
+    pmulhrsw        m18, m9
+    vpacksswb   m20{k1}, m18, m18
+    punpcklbw       m20, m5, m20
+    pmaddubsw       m18, m19, m20
+    pmaddubsw       m19, m21
+    add      grain_lutq, 82*2<<%2
+    pmulhrsw        m18, m6      ; noise
+    pmulhrsw        m19, m6
+    punpcklbw       m16, m17, m5 ; chroma
+    punpckhbw       m17, m5
+    paddw           m16, m18
+    paddw           m17, m19
+    packuswb        m16, m17
+    pmaxub          m16, m7
+    pminub          m16, m8
+%if %2
+    mova          [dstq+strideq*0], xm16
+    vextracti128  [dstq+strideq*1], ym16, 1
+    vextracti32x4 [dstq+strideq*2], m16, 2
+    vextracti32x4 [dstq+stride3q ], m16, 3
+%else
+    mova          [dstq+strideq*0], ym16
+    vextracti32x8 [dstq+strideq*1], m16, 1
+%endif
+    lea            dstq, [dstq+strideq*(2<<%2)]
+    sub              hb, 2<<%2
+    jg %%loop_y_h_overlap
+    add              wq, 32>>%2
+    jge %%end
+    mov            srcq, r11mp
+    mov            dstq, r12mp
+    lea           lumaq, [r13+wq*(1<<%2)]
+    add            srcq, wq
+    add            dstq, wq
+    cmp       dword r8m, 0       ; sby
+    jne %%hv_overlap
+    jmp %%loop_x_h_overlap
+
+%%v_overlap:
+    DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
+                _, sby, see, overlap, _, _, _, stride3
+
+    movzx          sbyd, sbyb
+    imul           seed, [fg_dataq+FGData.seed], 0x00010001
+    imul            r7d, sbyd, 173 * 0x00010001
+    imul           sbyd, 37 * 0x01000100
+    add             r7d, (105 << 16) | 188
+    add            sbyd, (178 << 24) | (141 << 8)
+    and             r7d, 0x00ff00ff
+    and            sbyd, 0xff00ff00
+    xor            seed, r7d
+    xor            seed, sbyd    ; (cur_seed << 16) | top_seed
+
+%if %3
+    vpbroadcastd    m13, [base+pb_23_22]
+    kxnorw           k3, k3, k3  ; v_overlap mask
+%elif %2
+    vbroadcasti32x8 m13, [base+pb_27_17]
+    kxnord           k3, k3, k3
+    pshufd          m13, m13, q0000 ; 8x27_17, 8x17_27
+%else
+    vpbroadcastd   ym16, [base+pb_27_17]
+    vpbroadcastd    m13, [base+pb_17_27]
+    vmovdqa64   m13{k1}, m16
+%endif
+
+    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+                offx, offy, see, overlap, top_offxy, _, _, stride3
+
+    mov           lumaq, r9mp
+    lea             r11, [srcq+wq]
+    lea             r12, [dstq+wq]
+    lea             r13, [lumaq+wq*(1<<%2)]
+    mov           r11mp, r11
+    mov           r12mp, r12
+    neg              wq
+
+    ; we assume from the block above that bits 8-15 of r7d are zero'ed
+    mov             r6d, seed
+    or             seed, 0xeff4eff4
+    test           seeb, seeh
+    setp            r7b          ; parity of top_seed
+    shr            seed, 16
+    shl             r7d, 16
+    test           seeb, seeh
+    setp            r7b          ; parity of cur_seed
+    or              r6d, 0x00010001
+    xor             r7d, r6d
+    rorx           seed, r7d, 1  ; updated (cur_seed << 16) | top_seed
+    rorx          offyd, seed, 8
+    rorx          offxd, seed, 12
+    and           offyd, 0x000f000f
+    and           offxd, 0x000f000f
+    imul          offyd, 164>>%3
+    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+                h, offxy, see, overlap, top_offxy, _, _, stride3
+
+    mov      grain_lutq, grain_lutmp
+    mov              hd, hm
+    movzx    top_offxyd, offxyw
+    shr          offxyd, 16
+
+%if %2
+    mova           ym18, [lumaq+lstrideq*(0<<%3)]
+    vinserti32x8    m18, [lumaq+lstrideq*(1<<%3)], 1
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+    mova           ym20, [lumaq+lstrideq*(0<<%3)]
+    vinserti32x8    m20, [lumaq+lstrideq*(1<<%3)], 1
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+    mova           xm17, [srcq+strideq*0]
+    vinserti128    ym17, [srcq+strideq*1], 1
+    mova            m19, m11
+    vpermi2b        m19, m18, m20
+    vpermt2b        m18, m12, m20
+    vinserti32x4    m17, [srcq+strideq*2], 2
+    pavgb           m18, m19
+    vinserti32x4    m17, [srcq+stride3q ], 3
+%else
+    mova           ym18, [lumaq+lstrideq*0]
+    vinserti32x8    m18, [lumaq+lstrideq*1], 1
+    mova           ym17, [srcq+strideq*0]
+    vinserti32x8    m17, [srcq+strideq*1], 1
+    lea           lumaq, [lumaq+lstrideq*2]
+%endif
+    lea            srcq, [srcq+strideq*(2<<%2)]
+%if %1
+    punpckhbw       m19, m18, m17
+    punpcklbw       m18, m17     ; { luma, chroma }
+    pmaddubsw       m19, m14
+    pmaddubsw       m18, m14
+    psraw           m19, 6
+    psraw           m18, 6
+    paddw           m19, m15
+    paddw           m18, m15
+    packuswb        m18, m19
+%endif
+    mova            m19, m0
+    vpmovb2m         k2, m18
+    vpermt2b        m19, m18, m1 ; scaling[  0..127]
+    vpermi2b        m18, m2, m3  ; scaling[128..255]
+%if %3
+    movu           xm21, [grain_lutq+offxyq+82*0]
+    movu           xm16, [grain_lutq+top_offxyq+82*0]
+    punpcklbw      xm20, xm16, xm21
+    punpckhbw      xm16, xm21
+    pmaddubsw      xm20, xm13, xm20
+    pmaddubsw      xm16, xm13, xm16
+    ; only interpolate first line, insert remaining line unmodified
+    vbroadcasti128 ym21, [grain_lutq+offxyq+82*1]
+    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
+    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
+    pmulhrsw       xm20, xm9
+    pmulhrsw       xm16, xm9
+    vpacksswb   m21{k3}, m20, m16
+%elif %2
+    movu           xm21, [grain_lutq+offxyq+82*0]
+    vinserti128    ym21, [grain_lutq+offxyq+82*1], 1
+    movu           xm16, [grain_lutq+top_offxyq+82*0]
+    vinserti32x4   ym16, [grain_lutq+top_offxyq+82*1], 1
+    punpcklbw      ym20, ym16, ym21
+    punpckhbw      ym16, ym21
+    pmaddubsw      ym20, ym13, ym20
+    pmaddubsw      ym16, ym13, ym16
+    vbroadcasti32x4 m21, [grain_lutq+offxyq+82*2]
+    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
+    pmulhrsw       ym20, ym9
+    pmulhrsw       ym16, ym9
+    packsswb    m21{k3}, m20, m16
+%else
+    movu           ym16, [grain_lutq+offxyq+82*0]
+    vinserti32x8    m16, [grain_lutq+offxyq+82*1], 1
+    movu           ym20, [grain_lutq+top_offxyq+82*0]
+    vinserti32x8    m20, [grain_lutq+top_offxyq+82*1], 1
+    punpcklbw       m21, m20, m16
+    punpckhbw       m20, m16
+    pmaddubsw       m21, m13, m21
+    pmaddubsw       m20, m13, m20
+    pmulhrsw        m21, m9
+    pmulhrsw        m20, m9
+    packsswb        m21, m20
+%endif
+    vmovdqu8    m19{k2}, m18     ; scaling[src]
+    pshufb          m19, m4
+    punpcklbw       m20, m5, m21
+    punpckhbw       m21, m5
+    pmaddubsw       m18, m19, m20
+    pmaddubsw       m19, m21
+    add      grain_lutq, 82*2<<%2
+    pmulhrsw        m18, m6      ; noise
+    pmulhrsw        m19, m6
+    punpcklbw       m16, m17, m5 ; chroma
+    punpckhbw       m17, m5
+    paddw           m16, m18
+    paddw           m17, m19
+    packuswb        m16, m17
+    pmaxub          m16, m7
+    pminub          m16, m8
+%if %2
+    mova          [dstq+strideq*0], xm16
+    vextracti128  [dstq+strideq*1], ym16, 1
+    vextracti32x4 [dstq+strideq*2], m16, 2
+    vextracti32x4 [dstq+stride3q ], m16, 3
+%else
+    mova          [dstq+strideq*0], ym16
+    vextracti32x8 [dstq+strideq*1], m16, 1
+%endif
+    lea            dstq, [dstq+strideq*(2<<%2)]
+    sub              hb, 2<<%2
+    jg %%loop_y
+    add              wq, 32>>%2
+    jge %%end
+    mov            srcq, r11mp
+    mov            dstq, r12mp
+    lea           lumaq, [r13+wq*(1<<%2)]
+    add            srcq, wq
+    add            dstq, wq
+
+%%hv_overlap:
+    ; we assume from the block above that bits 8-15 of r7d are zero'ed
+    mov             r6d, seed
+    or             seed, 0xeff4eff4
+    test           seeb, seeh
+    setp            r7b          ; parity of top_seed
+    shr            seed, 16
+    shl             r7d, 16
+    test           seeb, seeh
+    setp            r7b          ; parity of cur_seed
+    or              r6d, 0x00010001
+    xor             r7d, r6d
+    rorx           seed, r7d, 1  ; updated (cur_seed << 16) | top_seed
+
+    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+                offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3
+
+    lea  topleft_offxyd, [top_offxyq+(32>>%2)]
+    lea     left_offxyd, [offyq+(32>>%2)]
+    rorx          offyd, seed, 8
+    rorx          offxd, seed, 12
+    and           offyd, 0x000f000f
+    and           offxd, 0x000f000f
+    imul          offyd, 164>>%3
+    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+                h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3
+
+    mov      grain_lutq, grain_lutmp
+    mov              hd, hm
+    movzx    top_offxyd, offxyw
+    shr          offxyd, 16
+
+%if %2
+    movu           xm21, [grain_lutq+offxyq+82*0]
+    movd           xm16, [grain_lutq+left_offxyq+82*0]
+    vinserti128    ym21, [grain_lutq+offxyq+82*1], 1
+    vinserti128    ym16, [grain_lutq+left_offxyq+82*1], 1
+    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
+    vinserti32x4    m16, [grain_lutq+left_offxyq+82*2], 2
+    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
+    vinserti32x4    m16, [grain_lutq+left_offxyq+82*3], 3
+    movd           xm18, [grain_lutq+topleft_offxyq+82*0]
+    movu           xm20, [grain_lutq+top_offxyq]
+    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+    punpcklbw       m16, m21
+%if %3
+    punpcklbw      xm18, xm20
+%else
+    vinserti128    ym18, [grain_lutq+topleft_offxyq+82*1], 1
+    vinserti128    ym20, [grain_lutq+top_offxyq+82*1], 1
+    punpcklbw      ym18, ym20
+%endif
+    punpcklqdq      m16, m18
+    pmaddubsw       m16, m10, m16
+    pmulhrsw        m16, m9
+    packsswb        m16, m16
+%if %3
+    vpalignr   xm20{k1}, xm16, xm16, 4
+%else
+    vpalignr   ym20{k1}, ym16, ym16, 4
+%endif
+    vmovdqu8    m21{k1}, m16
+%else
+    movu           ym21, [grain_lutq+offxyq+82*0]
+    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
+    movd           xm16, [grain_lutq+left_offxyq+82*0]
+    vinserti32x4    m16, [grain_lutq+left_offxyq+82*1], 2
+    movu           ym20, [grain_lutq+top_offxyq+82*0]
+    vinserti32x8    m20, [grain_lutq+top_offxyq+82*1], 1
+    movd           xm18, [grain_lutq+topleft_offxyq+82*0]
+    vinserti32x4    m18, [grain_lutq+topleft_offxyq+82*1], 2
+    punpcklbw       m16, m21
+    punpcklbw       m18, m20
+    punpcklqdq      m16, m18
+    pmaddubsw       m16, m10, m16
+    pmulhrsw        m16, m9
+    packsswb        m16, m16
+    vpalignr    m20{k1}, m16, m16, 4
+    vmovdqu8    m21{k1}, m16
+%endif
+%if %2
+    mova           ym18, [lumaq+lstrideq*(0<<%3)]
+    vinserti32x8    m18, [lumaq+lstrideq*(1<<%3)], 1
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+    mova           ym16, [lumaq+lstrideq*(0<<%3)]
+    vinserti32x8    m16, [lumaq+lstrideq*(1<<%3)], 1
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+    mova           xm17, [srcq+strideq*0]
+    vinserti128    ym17, [srcq+strideq*1], 1
+    mova            m19, m11
+    vpermi2b        m19, m18, m16
+    vpermt2b        m18, m12, m16
+    vinserti32x4    m17, [srcq+strideq*2], 2
+    pavgb           m18, m19
+    vinserti32x4    m17, [srcq+stride3q ], 3
+%else
+    mova           ym18, [lumaq+lstrideq*0]
+    vinserti32x8    m18, [lumaq+lstrideq*1], 1
+    mova           ym17, [srcq+strideq*0]
+    vinserti32x8    m17, [srcq+strideq*1], 1
+    lea           lumaq, [lumaq+lstrideq*2]
+%endif
+    lea            srcq, [srcq+strideq*(2<<%2)]
+%if %1
+    punpckhbw       m19, m18, m17
+    punpcklbw       m18, m17     ; { luma, chroma }
+    pmaddubsw       m19, m14
+    pmaddubsw       m18, m14
+    psraw           m19, 6
+    psraw           m18, 6
+    paddw           m19, m15
+    paddw           m18, m15
+    packuswb        m18, m19
+%endif
+    mova            m19, m0
+    vpmovb2m         k2, m18
+    vpermt2b        m19, m18, m1 ; scaling[  0..127]
+    vpermi2b        m18, m2, m3  ; scaling[128..255]
+    ; followed by v interpolation (top | cur -> cur)
+%if %3
+    punpcklbw      xm16, xm20, xm21
+    punpckhbw      xm20, xm21
+    pmaddubsw      xm16, xm13, xm16
+    pmaddubsw      xm20, xm13, xm20
+    pmulhrsw       xm16, xm9
+    pmulhrsw       xm20, xm9
+    vpacksswb   m21{k3}, m16, m20
+%elif %2
+    punpcklbw      ym16, ym20, ym21
+    punpckhbw      ym20, ym21
+    pmaddubsw      ym16, ym13, ym16
+    pmaddubsw      ym20, ym13, ym20
+    pmulhrsw       ym16, ym9
+    pmulhrsw       ym20, ym9
+    vpacksswb   m21{k3}, m16, m20
+%else
+    punpcklbw       m16, m20, m21
+    punpckhbw       m20, m21
+    pmaddubsw       m16, m13, m16
+    pmaddubsw       m20, m13, m20
+    pmulhrsw        m16, m9
+    pmulhrsw        m20, m9
+    packsswb        m21, m16, m20
+%endif
+    vmovdqu8    m19{k2}, m18     ; scaling[src]
+    pshufb          m19, m4
+    punpcklbw       m20, m5, m21
+    punpckhbw       m21, m5
+    pmaddubsw       m18, m19, m20
+    pmaddubsw       m19, m21
+    add      grain_lutq, 82*2<<%2
+    pmulhrsw        m18, m6      ; grain
+    pmulhrsw        m19, m6
+    punpcklbw       m16, m17, m5 ; chroma
+    punpckhbw       m17, m5
+    paddw           m16, m18
+    paddw           m17, m19
+    packuswb        m16, m17
+    pmaxub          m16, m7
+    pminub          m16, m8
+%if %2
+    mova          [dstq+strideq*0], xm16
+    vextracti128  [dstq+strideq*1], ym16, 1
+    vextracti32x4 [dstq+strideq*2], m16, 2
+    vextracti32x4 [dstq+stride3q ], m16, 3
+%else
+    mova          [dstq+strideq*0], ym16
+    vextracti32x8 [dstq+strideq*1], m16, 1
+%endif
+    lea            dstq, [dstq+strideq*(2<<%2)]
+    sub              hb, 2<<%2
+    jg %%loop_y_h_overlap
+    add              wq, 32>>%2
+    jge %%end
+    mov            srcq, r11mp
+    mov            dstq, r12mp
+    lea           lumaq, [r13+wq*(1<<%2)]
+    add            srcq, wq
+    add            dstq, wq
+    jmp %%hv_overlap
+%%end:
+    RET
+%endmacro
+
+    %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+    %%FGUV_32x32xN_LOOP 0, %2, %3
+%endmacro
+
+FGUV_FN 420, 1, 1
+FGUV_FN 422, 1, 0
+FGUV_FN 444, 0, 0
+
+%endif ; ARCH_X86_64
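
For reference, the per-pixel operation that the fguv_32x32xn loops above vectorize is roughly the following scalar sketch (illustrative only: the helper names, the round2()/clipping code and the 8bpc-only simplifications are assumptions, not dav1d's internal API). The SIMD version does the same work 64 pixels at a time, keeps the 256-entry scaling LUT resident in registers (the vpermi2b/vpermt2b pair) and blends the grain values at block edges when overlap is enabled.

    /* Scalar sketch of one chroma pixel for the 8bpc case. */
    static inline int round2(const int x, const int shift) {
        return (x + (1 << (shift - 1))) >> shift;
    }

    static unsigned char apply_grain_uv_px(const unsigned char src_uv, const int luma_avg,
                                           const int grain, const unsigned char scaling[256],
                                           const int uv_luma_mult, const int uv_mult,
                                           const int uv_offset, const int scaling_shift,
                                           const int min_val, const int max_val,
                                           const int chroma_scaling_from_luma)
    {
        /* Index into the scaling LUT: the subsampled luma average directly in the
         * .csfl variant, otherwise a luma/chroma blend controlled by
         * uv_luma_mult/uv_mult/uv_offset (the %if %1 path in the macro). */
        int val = chroma_scaling_from_luma ? luma_avg
                : ((luma_avg * uv_luma_mult + src_uv * uv_mult) >> 6) + uv_offset;
        if (val < 0) val = 0; else if (val > 255) val = 255;

        /* noise = round2(scaling[val] * grain, scaling_shift), then add and clip
         * (the pmaddubsw/pmulhrsw, paddw, packuswb, pmaxub/pminub sequence above). */
        const int noise = round2(scaling[val] * grain, scaling_shift);
        int out = src_uv + noise;
        if (out < min_val) out = min_val; else if (out > max_val) out = max_val;
        return (unsigned char)out;
    }

The seed juggling at %%v_overlap/%%hv_overlap packs the current row's seed in the high half and the row above's seed in the low half of one register so both grain offsets can be derived with a single rorx each; the multiplies by 37 and 173 with the added 178/105 constants are the AV1 per-row seed update, and 141/188 are the same expressions evaluated for sby-1.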
diff --git a/third_party/dav1d/src/x86/filmgrain_common.asm b/third_party/dav1d/src/x86/filmgrain_common.asm
new file mode 100644
index 0000000000000..74f7044e666cb
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain_common.asm
@@ -0,0 +1,46 @@
+; Copyright © 2019-2022, VideoLAN and dav1d authors
+; Copyright © 2019-2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+struc FGData
+    .seed:                      resd 1
+    .num_y_points:              resd 1
+    .y_points:                  resb 14 * 2
+    .chroma_scaling_from_luma:  resd 1
+    .num_uv_points:             resd 2
+    .uv_points:                 resb 2 * 10 * 2
+    .scaling_shift:             resd 1
+    .ar_coeff_lag:              resd 1
+    .ar_coeffs_y:               resb 24
+    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
+    .ar_coeff_shift:            resq 1
+    .grain_scale_shift:         resd 1
+    .uv_mult:                   resd 2
+    .uv_luma_mult:              resd 2
+    .uv_offset:                 resd 2
+    .overlap_flag:              resd 1
+    .clip_to_restricted_range:  resd 1
+endstruc
+
+cextern gaussian_sequence
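
The FGData struc factored out into filmgrain_common.asm above has to stay byte-for-byte in sync with the film grain parameter struct on the C side; read off the resd/resb/resq sizes, the layout corresponds to roughly the following C mirror (a sketch for illustration only — the authoritative definition is the film grain data struct in dav1d's public headers, and gaussian_sequence is the spec's Gaussian noise table exported from the C tables):

    #include <stdint.h>

    /* Hypothetical mirror of the FGData struc above; field shapes are simply
     * read off the asm directives, not copied from the dav1d header. */
    typedef struct FGDataMirror {
        int      seed;                      /* resd 1          */
        int      num_y_points;              /* resd 1          */
        uint8_t  y_points[14][2];           /* resb 14 * 2     */
        int      chroma_scaling_from_luma;  /* resd 1          */
        int      num_uv_points[2];          /* resd 2          */
        uint8_t  uv_points[2][10][2];       /* resb 2 * 10 * 2 */
        int      scaling_shift;             /* resd 1          */
        int      ar_coeff_lag;              /* resd 1          */
        int8_t   ar_coeffs_y[24];           /* resb 24         */
        int8_t   ar_coeffs_uv[2][28];       /* resb 2 * 28, includes padding */
        uint64_t ar_coeff_shift;            /* resq 1          */
        int      grain_scale_shift;         /* resd 1          */
        int      uv_mult[2];                /* resd 2          */
        int      uv_luma_mult[2];           /* resd 2          */
        int      uv_offset[2];              /* resd 2          */
        int      overlap_flag;              /* resd 1          */
        int      clip_to_restricted_range;  /* resd 1          */
    } FGDataMirror;

Keeping the struc in one shared include rather than a private copy per ISA file (the SSE diff below drops its duplicate) removes one easy way for the asm and C offsets to drift apart.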
diff --git a/third_party/dav1d/src/x86/film_grain_init_tmpl.c b/third_party/dav1d/src/x86/filmgrain_init_tmpl.c
similarity index 67%
rename from third_party/dav1d/src/x86/film_grain_init_tmpl.c
rename to third_party/dav1d/src/x86/filmgrain_init_tmpl.c
index 606ea3cb56cff..0b783d10d3c45 100644
--- a/third_party/dav1d/src/x86/film_grain_init_tmpl.c
+++ b/third_party/dav1d/src/x86/filmgrain_init_tmpl.c
@@ -1,6 +1,6 @@
 /*
- * Copyright © 2018-2021, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
+ * Copyright © 2018-2022, VideoLAN and dav1d authors
+ * Copyright © 2018-2022, Two Orioles, LLC
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -26,25 +26,21 @@
  */
 
 #include "src/cpu.h"
-#include "src/film_grain.h"
+#include "src/filmgrain.h"
 
-decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ssse3));
-decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ssse3));
-decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ssse3));
-decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ssse3));
-decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ssse3));
-decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ssse3));
-decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ssse3));
-decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ssse3));
+#define decl_fg_fns(ext)                                         \
+decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ext));       \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ext)); \
+decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ext));                 \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ext));          \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ext));          \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ext))
 
-decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, avx2));
-decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, avx2));
-decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, avx2));
-decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, avx2));
-decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, avx2));
-decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, avx2));
-decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, avx2));
-decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, avx2));
+decl_fg_fns(ssse3);
+decl_fg_fns(avx2);
+decl_fg_fns(avx512icl);
 
 COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
     const unsigned flags = dav1d_get_cpu_flags();
@@ -68,11 +64,20 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c
     c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2);
     c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2);
 
-    if (flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER) return;
+    if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
+        c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
+        c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
+        c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2);
+        c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2);
+    }
 
-    c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
-    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
-    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2);
-    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2);
+#if BITDEPTH == 8
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+    c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl);
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx512icl);
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl);
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl);
+#endif
 #endif
 }
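
The restructuring above turns the slow-gather early return into a guard around just the AVX2 grain-application assignments, so the AVX-512 (Ice Lake) entry points added in this update can still be installed afterwards for 8bpc builds. A self-contained sketch of the resulting dispatch idiom — every identifier here is invented for illustration and is not dav1d's API:

    #include <stdio.h>

    enum { FLAG_SSSE3 = 1, FLAG_AVX2 = 2, FLAG_SLOW_GATHER = 4, FLAG_AVX512ICL = 8 };

    static void fgy_c(void)         { puts("C");         }
    static void fgy_ssse3(void)     { puts("SSSE3");     }
    static void fgy_avx2(void)      { puts("AVX2");      }
    static void fgy_avx512icl(void) { puts("AVX512ICL"); }

    typedef struct { void (*fgy)(void); } FilmGrainDSP;

    static void init_fg_dsp(FilmGrainDSP *const c, const unsigned flags) {
        c->fgy = fgy_c;                                   /* portable fallback  */
        if (flags & FLAG_SSSE3)
            c->fgy = fgy_ssse3;
        if ((flags & FLAG_AVX2) && !(flags & FLAG_SLOW_GATHER))
            c->fgy = fgy_avx2;                            /* skipped, not fatal */
        if (flags & FLAG_AVX512ICL)
            c->fgy = fgy_avx512icl;                       /* still reachable    */
    }

    int main(void) {
        FilmGrainDSP c;
        init_fg_dsp(&c, FLAG_SSSE3 | FLAG_AVX2 | FLAG_SLOW_GATHER | FLAG_AVX512ICL);
        c.fgy();  /* prints "AVX512ICL": a slow gather no longer blocks later tiers */
        return 0;
    }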
diff --git a/third_party/dav1d/src/x86/film_grain_sse.asm b/third_party/dav1d/src/x86/filmgrain_sse.asm
similarity index 99%
rename from third_party/dav1d/src/x86/film_grain_sse.asm
rename to third_party/dav1d/src/x86/filmgrain_sse.asm
index 20334591a93db..0172f987607a7 100644
--- a/third_party/dav1d/src/x86/film_grain_sse.asm
+++ b/third_party/dav1d/src/x86/filmgrain_sse.asm
@@ -25,6 +25,7 @@
 
 %include "config.asm"
 %include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
 
 SECTION_RODATA
 
@@ -66,38 +67,8 @@ JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3
 JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3
 JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3
 
-struc FGData
-    .seed:                      resd 1
-    .num_y_points:              resd 1
-    .y_points:                  resb 14 * 2
-    .chroma_scaling_from_luma:  resd 1
-    .num_uv_points:             resd 2
-    .uv_points:                 resb 2 * 10 * 2
-    .scaling_shift:             resd 1
-    .ar_coeff_lag:              resd 1
-    .ar_coeffs_y:               resb 24
-    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
-    .ar_coeff_shift:            resq 1
-    .grain_scale_shift:         resd 1
-    .uv_mult:                   resd 2
-    .uv_luma_mult:              resd 2
-    .uv_offset:                 resd 2
-    .overlap_flag:              resd 1
-    .clip_to_restricted_range:  resd 1
-endstruc
-
-cextern gaussian_sequence
-
 SECTION .text
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 %if ARCH_X86_32
 %define PIC_ptr(a) base+a
 %else
diff --git a/third_party/dav1d/src/x86/ipred16_avx2.asm b/third_party/dav1d/src/x86/ipred16_avx2.asm
index e6d4faddee8ae..72300c2a4cd07 100644
--- a/third_party/dav1d/src/x86/ipred16_avx2.asm
+++ b/third_party/dav1d/src/x86/ipred16_avx2.asm
@@ -26,7 +26,7 @@
 %include "config.asm"
 %include "ext/x86/x86inc.asm"
 
-SECTION_RODATA 32
+SECTION_RODATA 64
 
 %macro SMOOTH_WEIGHTS 1-*
 const smooth_weights_1d_16bpc ; sm_weights[] << 7
@@ -134,14 +134,6 @@ cextern filter_intra_taps
 
 SECTION .text
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 INIT_YMM avx2
 cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
     movifnidn            hd, hm
diff --git a/third_party/dav1d/src/x86/ipred16_avx512.asm b/third_party/dav1d/src/x86/ipred16_avx512.asm
new file mode 100644
index 0000000000000..4a1b060bd5f67
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred16_avx512.asm
@@ -0,0 +1,833 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+ipred_shuf:    db 14, 15, 14, 15,  0,  1,  2,  3,  6,  7,  6,  7,  0,  1,  2,  3
+               db 10, 11, 10, 11,  8,  9, 10, 11,  2,  3,  2,  3,  8,  9, 10, 11
+               db 12, 13, 12, 13,  4,  5,  6,  7,  4,  5,  4,  5,  4,  5,  6,  7
+               db  8,  9,  8,  9, 12, 13, 14, 15,  0,  1,  0,  1, 12, 13, 14, 15
+smooth_perm:   db  1,  2,  5,  6,  9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
+               db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
+               db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
+               db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
+pal_pred_perm: db  0, 32,  1, 33,  2, 34,  3, 35,  4, 36,  5, 37,  6, 38,  7, 39
+               db  8, 40,  9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
+               db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
+               db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
+filter_permA:  times 4 db  6,  7,  8,  9, 14, 15,  4,  5
+               times 4 db 10, 11, 12, 13,  2,  3, -1, -1
+filter_permB:  times 4 db 22, 23, 24, 25, 30, 31,  6,  7
+               times 4 db 26, 27, 28, 29, 14, 15, -1, -1
+filter_permC:          dd  8 ; dq  8, 10,  1, 11,  0,  9
+pw_1:          times 2 dw  1
+                       dd 10
+filter_rnd:            dd 32
+                       dd  1
+                       dd  8
+                       dd 11
+filter_shift:  times 2 dw  6
+                       dd  0
+               times 2 dw  4
+                       dd  9
+
+%macro JMP_TABLE 3-*
+    %xdefine %1_%2_table (%%table - 2*4)
+    %xdefine %%base mangle(private_prefix %+ _%1_%2)
+    %%table:
+    %rep %0 - 2
+        dd %%base %+ .%3 - (%%table - 2*4)
+        %rotate 1
+    %endrep
+%endmacro
+
+JMP_TABLE ipred_paeth_16bpc,      avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_16bpc,     avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h_16bpc,   avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v_16bpc,   avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE pal_pred_16bpc,         avx512icl, w4, w8, w16, w32, w64
+
+cextern smooth_weights_1d_16bpc
+cextern smooth_weights_2d_16bpc
+cextern filter_intra_taps
+
+SECTION .text
+
+%macro PAETH 3 ; top, signed_ldiff, ldiff
+    paddw               m0, m%2, m2
+    psubw               m1, m0, m3  ; tldiff
+    psubw               m0, m%1     ; tdiff
+    pabsw               m1, m1
+    pabsw               m0, m0
+    pcmpgtw             k1, m0, m1
+    pminsw              m0, m1
+    pcmpgtw             k2, m%3, m0
+    vpblendmw       m0{k1}, m%1, m3
+    vpblendmw       m0{k2}, m2, m0
+%endmacro
+
+INIT_ZMM avx512icl
+cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h
+%define base r6-ipred_paeth_16bpc_avx512icl_table
+    lea                 r6, [ipred_paeth_16bpc_avx512icl_table]
+    tzcnt               wd, wm
+    movifnidn           hd, hm
+    movsxd              wq, [r6+wq*4]
+    vpbroadcastw        m3, [tlq]   ; topleft
+    add                 wq, r6
+    jmp                 wq
+.w4:
+    vpbroadcastq        m4, [tlq+2] ; top
+    movsldup            m7, [base+ipred_shuf]
+    lea                 r6, [strideq*3]
+    psubw               m5, m4, m3
+    pabsw               m6, m5
+.w4_loop:
+    sub                tlq, 16
+    vbroadcasti32x4     m2, [tlq]
+    pshufb              m2, m7      ; left
+    PAETH                4, 5, 6
+    vextracti32x4     xmm1, m0, 2
+    vextracti32x4     xmm2, ym0, 1
+    vextracti32x4     xmm3, m0, 3
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xmm1
+    movq   [dstq+strideq*2], xmm2
+    movq   [dstq+r6       ], xmm3
+    sub                 hd, 8
+    jl .w4_end
+    lea               dstq, [dstq+strideq*4]
+    movhps [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xmm1
+    movhps [dstq+strideq*2], xmm2
+    movhps [dstq+r6       ], xmm3
+    lea               dstq, [dstq+strideq*4]
+    jg .w4_loop
+.w4_end:
+    RET
+.w8:
+    vbroadcasti32x4     m4, [tlq+2]
+    movsldup            m7, [base+ipred_shuf]
+    lea                 r6, [strideq*3]
+    psubw               m5, m4, m3
+    pabsw               m6, m5
+.w8_loop:
+    sub                tlq, 8
+    vpbroadcastq        m2, [tlq]
+    pshufb              m2, m7
+    PAETH                4, 5, 6
+    mova          [dstq+strideq*0], xm0
+    vextracti32x4 [dstq+strideq*1], m0, 2
+    vextracti32x4 [dstq+strideq*2], ym0, 1
+    vextracti32x4 [dstq+r6       ], m0, 3
+    lea               dstq, [dstq+strideq*4]
+    sub                 hd, 4
+    jg .w8_loop
+    RET
+.w16:
+    vbroadcasti32x8     m4, [tlq+2]
+    movsldup            m7, [base+ipred_shuf]
+    psubw               m5, m4, m3
+    pabsw               m6, m5
+.w16_loop:
+    sub                tlq, 4
+    vpbroadcastd        m2, [tlq]
+    pshufb              m2, m7
+    PAETH                4, 5, 6
+    mova          [dstq+strideq*0], ym0
+    vextracti32x8 [dstq+strideq*1], m0, 1
+    lea               dstq, [dstq+strideq*2]
+    sub                 hd, 2
+    jg .w16_loop
+    RET
+.w32:
+    movu                m4, [tlq+2]
+    psubw               m5, m4, m3
+    pabsw               m6, m5
+.w32_loop:
+    sub                tlq, 2
+    vpbroadcastw        m2, [tlq]
+    PAETH                4, 5, 6
+    mova            [dstq], m0
+    add               dstq, strideq
+    dec                 hd
+    jg .w32_loop
+    RET
+.w64:
+    movu                m4, [tlq+ 2]
+    movu                m7, [tlq+66]
+    psubw               m5, m4, m3
+    psubw               m8, m7, m3
+    pabsw               m6, m5
+    pabsw               m9, m8
+.w64_loop:
+    sub                tlq, 2
+    vpbroadcastw        m2, [tlq]
+    PAETH                4, 5, 6
+    mova       [dstq+64*0], m0
+    PAETH                7, 8, 9
+    mova       [dstq+64*1], m0
+    add               dstq, strideq
+    dec                 hd
+    jg .w64_loop
+    RET
+
+cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
+%define base r6-$$
+    lea                  r6, [$$]
+    tzcnt                wd, wm
+    mov                  hd, hm
+    movsxd               wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq*4]
+    lea            weightsq, [base+smooth_weights_1d_16bpc+hq*4]
+    neg                  hq
+    vpbroadcastw         m6, [tlq+hq*2] ; bottom
+    lea                  wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq]
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4:
+    vpbroadcastq         m5, [tlq+2]    ; top
+    movsldup             m4, [ipred_shuf]
+    psubw                m5, m6         ; top - bottom
+.w4_loop:
+    vbroadcasti32x4      m3, [weightsq+hq*2]
+    pshufb               m3, m4
+    pmulhrsw             m3, m5
+    paddw                m3, m6
+    vextracti32x4      xmm0, m3, 3
+    vextracti32x4      xmm1, ym3, 1
+    vextracti32x4      xmm2, m3, 2
+    movhps [dstq+strideq*0], xmm0
+    movhps [dstq+strideq*1], xmm1
+    movhps [dstq+strideq*2], xmm2
+    movhps [dstq+stride3q ], xm3
+    add                  hq, 8
+    jg .end
+    lea                dstq, [dstq+strideq*4]
+    movq   [dstq+strideq*0], xmm0
+    movq   [dstq+strideq*1], xmm1
+    movq   [dstq+strideq*2], xmm2
+    movq   [dstq+stride3q ], xm3
+    lea                dstq, [dstq+strideq*4]
+    jl .w4_loop
+.end:
+    RET
+.w8:
+    vbroadcasti32x4      m5, [tlq+2]    ; top
+    movsldup             m4, [ipred_shuf]
+    psubw                m5, m6         ; top - bottom
+.w8_loop:
+    vpbroadcastq         m0, [weightsq+hq*2]
+    pshufb               m0, m4
+    pmulhrsw             m0, m5
+    paddw                m0, m6
+    vextracti32x4 [dstq+strideq*0], m0, 3
+    vextracti32x4 [dstq+strideq*1], ym0, 1
+    vextracti32x4 [dstq+strideq*2], m0, 2
+    mova          [dstq+stride3q ], xm0
+    lea                dstq, [dstq+strideq*4]
+    add                  hq, 4
+    jl .w8_loop
+    RET
+.w16:
+    vbroadcasti32x8      m5, [tlq+2]    ; top
+    movsldup             m4, [ipred_shuf]
+    psubw                m5, m6         ; top - bottom
+.w16_loop:
+    vpbroadcastd         m0, [weightsq+hq*2+0]
+    vpbroadcastd         m1, [weightsq+hq*2+4]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    paddw                m0, m6
+    paddw                m1, m6
+    vextracti32x8 [dstq+strideq*0], m0, 1
+    mova          [dstq+strideq*1], ym0
+    vextracti32x8 [dstq+strideq*2], m1, 1
+    mova          [dstq+stride3q ], ym1
+    lea                dstq, [dstq+strideq*4]
+    add                  hq, 4
+    jl .w16_loop
+    RET
+.w32:
+    movu                 m5, [tlq+2]
+    psubw                m5, m6
+.w32_loop:
+    vpbroadcastw         m0, [weightsq+hq*2+0]
+    vpbroadcastw         m1, [weightsq+hq*2+2]
+    vpbroadcastw         m2, [weightsq+hq*2+4]
+    vpbroadcastw         m3, [weightsq+hq*2+6]
+    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
+    REPX   {paddw    x, m6}, m0, m1, m2, m3
+    mova   [dstq+strideq*0], m0
+    mova   [dstq+strideq*1], m1
+    mova   [dstq+strideq*2], m2
+    mova   [dstq+stride3q ], m3
+    lea                dstq, [dstq+strideq*4]
+    add                  hq, 4
+    jl .w32_loop
+    RET
+.w64:
+    movu                 m4, [tlq+ 2]
+    movu                 m5, [tlq+66]
+    psubw                m4, m6
+    psubw                m5, m6
+.w64_loop:
+    vpbroadcastw         m1, [weightsq+hq*2+0]
+    vpbroadcastw         m3, [weightsq+hq*2+2]
+    pmulhrsw             m0, m4, m1
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m4, m3
+    pmulhrsw             m3, m5
+    REPX      {paddw x, m6}, m0, m1, m2, m3
+    mova [dstq+strideq*0+64*0], m0
+    mova [dstq+strideq*0+64*1], m1
+    mova [dstq+strideq*1+64*0], m2
+    mova [dstq+strideq*1+64*1], m3
+    lea                dstq, [dstq+strideq*2]
+    add                  hq, 2
+    jl .w64_loop
+    RET
+
+cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3
+    lea                  r6, [$$]
+    mov                  wd, wm
+    movifnidn            hd, hm
+    vpbroadcastw         m6, [tlq+wq*2] ; right
+    tzcnt                wd, wd
+    add                  hd, hd
+    movsxd               wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq*4]
+    sub                 tlq, hq
+    lea            stride3q, [strideq*3]
+    lea                  wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq]
+    jmp                  wq
+.w4:
+    movsldup             m4, [base+ipred_shuf]
+    vpbroadcastq         m5, [base+smooth_weights_1d_16bpc+4*2]
+.w4_loop:
+    vbroadcasti32x4      m0, [tlq+hq-16] ; left
+    pshufb               m0, m4
+    psubw                m0, m6          ; left - right
+    pmulhrsw             m0, m5
+    paddw                m0, m6
+    vextracti32x4      xmm1, m0, 2
+    vextracti32x4      xmm2, ym0, 1
+    vextracti32x4      xmm3, m0, 3
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xmm1
+    movq   [dstq+strideq*2], xmm2
+    movq   [dstq+stride3q ], xmm3
+    sub                  hd, 8*2
+    jl .end
+    lea                dstq, [dstq+strideq*4]
+    movhps [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xmm1
+    movhps [dstq+strideq*2], xmm2
+    movhps [dstq+stride3q ], xmm3
+    lea                dstq, [dstq+strideq*4]
+    jg .w4_loop
+.end:
+    RET
+.w8:
+    movsldup             m4, [base+ipred_shuf]
+    vbroadcasti32x4      m5, [base+smooth_weights_1d_16bpc+8*2]
+.w8_loop:
+    vpbroadcastq         m0, [tlq+hq-8] ; left
+    pshufb               m0, m4
+    psubw                m0, m6         ; left - right
+    pmulhrsw             m0, m5
+    paddw                m0, m6
+    mova          [dstq+strideq*0], xm0
+    vextracti32x4 [dstq+strideq*1], m0, 2
+    vextracti32x4 [dstq+strideq*2], ym0, 1
+    vextracti32x4 [dstq+stride3q ], m0, 3
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4*2
+    jg .w8_loop
+    RET
+.w16:
+    movsldup             m4, [base+ipred_shuf]
+    vbroadcasti32x8      m5, [base+smooth_weights_1d_16bpc+16*2]
+.w16_loop:
+    vpbroadcastd         m0, [tlq+hq-4]
+    vpbroadcastd         m1, [tlq+hq-8]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    psubw                m0, m6
+    psubw                m1, m6
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    paddw                m0, m6
+    paddw                m1, m6
+    mova          [dstq+strideq*0], ym0
+    vextracti32x8 [dstq+strideq*1], m0, 1
+    mova          [dstq+strideq*2], ym1
+    vextracti32x8 [dstq+stride3q ], m1, 1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hq, 4*2
+    jg .w16_loop
+    RET
+.w32:
+    movu                 m5, [base+smooth_weights_1d_16bpc+32*2]
+.w32_loop:
+    vpbroadcastq         m3, [tlq+hq-8]
+    punpcklwd            m3, m3
+    psubw                m3, m6
+    pshufd               m0, m3, q3333
+    pshufd               m1, m3, q2222
+    pshufd               m2, m3, q1111
+    pshufd               m3, m3, q0000
+    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
+    REPX   {paddw    x, m6}, m0, m1, m2, m3
+    mova   [dstq+strideq*0], m0
+    mova   [dstq+strideq*1], m1
+    mova   [dstq+strideq*2], m2
+    mova   [dstq+stride3q ], m3
+    lea                dstq, [dstq+strideq*4]
+    sub                  hq, 4*2
+    jg .w32_loop
+    RET
+.w64:
+    movu                 m4, [base+smooth_weights_1d_16bpc+64*2]
+    movu                 m5, [base+smooth_weights_1d_16bpc+64*3]
+.w64_loop:
+    vpbroadcastw         m1, [tlq+hq-2]
+    vpbroadcastw         m3, [tlq+hq-4]
+    psubw                m1, m6
+    psubw                m3, m6
+    pmulhrsw             m0, m4, m1
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m4, m3
+    pmulhrsw             m3, m5
+    REPX      {paddw x, m6}, m0, m1, m2, m3
+    mova [dstq+strideq*0+64*0], m0
+    mova [dstq+strideq*0+64*1], m1
+    mova [dstq+strideq*1+64*0], m2
+    mova [dstq+strideq*1+64*1], m3
+    lea                dstq, [dstq+strideq*2]
+    sub                  hq, 2*2
+    jg .w64_loop
+    RET
+
+cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
+    lea                 r6, [$$]
+    mov                 wd, wm
+    movifnidn           hd, hm
+    vpbroadcastw       m13, [tlq+wq*2]   ; right
+    tzcnt               wd, wd
+    add                 hd, hd
+    movsxd              wq, [base+ipred_smooth_16bpc_avx512icl_table+wq*4]
+    mov                r5d, 0x55555555
+    sub                tlq, hq
+    mova               m14, [base+smooth_perm]
+    kmovd               k1, r5d
+    vpbroadcastw        m0, [tlq]        ; bottom
+    mov                 r5, 0x3333333333333333
+    pxor               m15, m15
+    lea                 wq, [base+ipred_smooth_16bpc_avx512icl_table+wq]
+    kmovq               k2, r5
+    lea         v_weightsq, [base+smooth_weights_2d_16bpc+hq*2]
+    jmp                 wq
+.w4:
+    vpbroadcastq        m5, [tlq+hq+2]
+    movshdup            m3, [base+ipred_shuf]
+    movsldup            m4, [base+ipred_shuf]
+    vbroadcasti32x4     m6, [base+smooth_weights_2d_16bpc+4*4]
+    lea           stride3q, [strideq*3]
+    punpcklwd           m5, m0           ; top, bottom
+.w4_loop:
+    vbroadcasti32x4     m0, [v_weightsq]
+    vpbroadcastq        m2, [tlq+hq-8]
+    mova                m1, m13
+    pshufb              m0, m3
+    pmaddwd             m0, m5
+    pshufb          m1{k2}, m2, m4       ; left, right
+    vpdpwssd            m0, m1, m6
+    vpermb              m0, m14, m0
+    pavgw              ym0, ym15
+    vextracti32x4     xmm1, ym0, 1
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xmm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+stride3q ], xmm1
+    lea               dstq, [dstq+strideq*4]
+    add         v_weightsq, 4*4
+    sub                 hd, 4*2
+    jg .w4_loop
+    RET
+.w8:
+    vbroadcasti32x4    ym5, [tlq+hq+2]
+    movshdup            m6, [base+ipred_shuf]
+    movsldup            m7, [base+ipred_shuf]
+    pmovzxwd            m5, ym5
+    vbroadcasti32x8     m8, [base+smooth_weights_2d_16bpc+8*4]
+    lea           stride3q, [strideq*3]
+    vpblendmw       m5{k1}, m0, m5       ; top, bottom
+.w8_loop:
+    vpbroadcastq        m0, [v_weightsq+0]
+    vpbroadcastq        m1, [v_weightsq+8]
+    vpbroadcastd        m3, [tlq+hq-4]
+    vpbroadcastd        m4, [tlq+hq-8]
+    pshufb              m0, m6
+    pmaddwd             m0, m5
+    pshufb              m1, m6
+    pmaddwd             m1, m5
+    mova                m2, m13
+    pshufb          m2{k2}, m3, m7       ; left, right
+    mova                m3, m13
+    pshufb          m3{k2}, m4, m7
+    vpdpwssd            m0, m2, m8
+    vpdpwssd            m1, m3, m8
+    add         v_weightsq, 4*4
+    vpermt2b            m0, m14, m1
+    pavgw               m0, m15
+    mova          [dstq+strideq*0], xm0
+    vextracti32x4 [dstq+strideq*1], ym0, 1
+    vextracti32x4 [dstq+strideq*2], m0, 2
+    vextracti32x4 [dstq+stride3q ], m0, 3
+    lea               dstq, [dstq+strideq*4]
+    sub                 hd, 4*2
+    jg .w8_loop
+    RET
+.w16:
+    pmovzxwd            m5, [tlq+hq+2]
+    mova                m6, [base+smooth_weights_2d_16bpc+16*4]
+    vpblendmw       m5{k1}, m0, m5       ; top, bottom
+.w16_loop:
+    vpbroadcastd        m0, [v_weightsq+0]
+    vpbroadcastd        m1, [v_weightsq+4]
+    pmaddwd             m0, m5
+    pmaddwd             m1, m5
+    mova                m2, m13
+    vpbroadcastw    m2{k1}, [tlq+hq-2] ; left, right
+    mova                m3, m13
+    vpbroadcastw    m3{k1}, [tlq+hq-4]
+    vpdpwssd            m0, m2, m6
+    vpdpwssd            m1, m3, m6
+    add         v_weightsq, 2*4
+    vpermt2b            m0, m14, m1
+    pavgw               m0, m15
+    mova          [dstq+strideq*0], ym0
+    vextracti32x8 [dstq+strideq*1], m0, 1
+    lea               dstq, [dstq+strideq*2]
+    sub                 hq, 2*2
+    jg .w16_loop
+    RET
+.w32:
+    pmovzxwd            m5, [tlq+hq+ 2]
+    pmovzxwd            m6, [tlq+hq+34]
+    mova                m7, [base+smooth_weights_2d_16bpc+32*4]
+    mova                m8, [base+smooth_weights_2d_16bpc+32*6]
+    vpblendmw       m5{k1}, m0, m5       ; top, bottom
+    vpblendmw       m6{k1}, m0, m6
+.w32_loop:
+    vpbroadcastd        m2, [v_weightsq+0]
+    vpbroadcastd        m3, [v_weightsq+4]
+    pmaddwd             m0, m5, m2
+    pmaddwd             m2, m6
+    pmaddwd             m1, m5, m3
+    pmaddwd             m3, m6
+    mova                m4, m13
+    vpbroadcastw    m4{k1}, [tlq+hq-2] ; left, right
+    vpdpwssd            m0, m4, m7
+    vpdpwssd            m2, m4, m8
+    mova                m4, m13
+    vpbroadcastw    m4{k1}, [tlq+hq-4]
+    vpdpwssd            m1, m4, m7
+    vpdpwssd            m3, m4, m8
+    add         v_weightsq, 2*4
+    vpermt2b            m0, m14, m2
+    vpermt2b            m1, m14, m3
+    pavgw               m0, m15
+    pavgw               m1, m15
+    mova  [dstq+strideq*0], m0
+    mova  [dstq+strideq*1], m1
+    lea               dstq, [dstq+strideq*2]
+    sub                 hq, 2*2
+    jg .w32_loop
+    RET
+.w64:
+    pmovzxwd            m5, [tlq+hq+ 2]
+    pmovzxwd            m6, [tlq+hq+34]
+    pmovzxwd            m7, [tlq+hq+66]
+    pmovzxwd            m8, [tlq+hq+98]
+    mova                m9, [base+smooth_weights_2d_16bpc+64*4]
+    vpblendmw       m5{k1}, m0, m5       ; top, bottom
+    mova               m10, [base+smooth_weights_2d_16bpc+64*5]
+    vpblendmw       m6{k1}, m0, m6
+    mova               m11, [base+smooth_weights_2d_16bpc+64*6]
+    vpblendmw       m7{k1}, m0, m7
+    mova               m12, [base+smooth_weights_2d_16bpc+64*7]
+    vpblendmw       m8{k1}, m0, m8
+.w64_loop:
+    vpbroadcastd        m3, [v_weightsq]
+    mova                m4, m13
+    vpbroadcastw    m4{k1}, [tlq+hq-2] ; left, right
+    pmaddwd             m0, m5, m3
+    pmaddwd             m2, m6, m3
+    pmaddwd             m1, m7, m3
+    pmaddwd             m3, m8
+    vpdpwssd            m0, m4, m9
+    vpdpwssd            m2, m4, m10
+    vpdpwssd            m1, m4, m11
+    vpdpwssd            m3, m4, m12
+    add         v_weightsq, 1*4
+    vpermt2b            m0, m14, m2
+    vpermt2b            m1, m14, m3
+    pavgw               m0, m15
+    pavgw               m1, m15
+    mova       [dstq+64*0], m0
+    mova       [dstq+64*1], m1
+    add               dstq, strideq
+    sub                 hd, 1*2
+    jg .w64_loop
+    RET
+
+cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
+    lea                  r6, [pal_pred_16bpc_avx512icl_table]
+    tzcnt                wd, wm
+    mova                 m2, [pal_pred_perm]
+    movsxd               wq, [r6+wq*4]
+    mova                xm3, [palq]
+    movifnidn            hd, hm
+    add                  wq, r6
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4:
+    pmovzxbw            ym0, [idxq]
+    add                idxq, 16
+    vpermw              ym0, ym0, ym3
+    vextracti32x4      xmm1, ym0, 1
+    movq   [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xm0
+    movq   [dstq+strideq*2], xmm1
+    movhps [dstq+stride3q ], xmm1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w4
+    RET
+.w8:
+    pmovzxbw             m0, [idxq]
+    add                idxq, 32
+    vpermw               m0, m0, m3
+    mova          [dstq+strideq*0], xm0
+    vextracti32x4 [dstq+strideq*1], ym0, 1
+    vextracti32x4 [dstq+strideq*2], m0, 2
+    vextracti32x4 [dstq+stride3q ], m0, 3
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w8
+    RET
+.w16:
+    vpermb               m1, m2, [idxq]
+    add                idxq, 64
+    vpermw               m0, m1, m3
+    psrlw                m1, 8
+    vpermw               m1, m1, m3
+    mova          [dstq+strideq*0], ym0
+    vextracti32x8 [dstq+strideq*1], m0, 1
+    mova          [dstq+strideq*2], ym1
+    vextracti32x8 [dstq+stride3q ], m1, 1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w16
+    RET
+.w32:
+    vpermb               m1, m2, [idxq]
+    add                idxq, 64
+    vpermw               m0, m1, m3
+    psrlw                m1, 8
+    vpermw               m1, m1, m3
+    mova   [dstq+strideq*0], m0
+    mova   [dstq+strideq*1], m1
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w32
+    RET
+.w64:
+    vpermb               m1, m2, [idxq]
+    add                idxq, 64
+    vpermw               m0, m1, m3
+    psrlw                m1, 8
+    vpermw               m1, m1, m3
+    mova        [dstq+64*0], m0
+    mova        [dstq+64*1], m1
+    add                dstq, strideq
+    dec                  hd
+    jg .w64
+    RET
+
+; The ipred_filter SIMD processes 4x2 blocks in the following order which
+; increases parallelism compared to doing things row by row.
+;     w4     w8       w16             w32
+;     1     1 2     1 2 5 6     1 2 5 6 9 a d e
+;     2     2 3     2 3 6 7     2 3 6 7 a b e f
+;     3     3 4     3 4 7 8     3 4 7 8 b c f g
+;     4     4 5     4 5 8 9     4 5 8 9 c d g h
+
+cglobal ipred_filter_16bpc, 4, 7, 14, dst, stride, tl, w, h, filter, top
+%define base r6-$$
+    lea                  r6, [$$]
+%ifidn filterd, filterm
+    movzx           filterd, filterb
+%else
+    movzx           filterd, byte filterm
+%endif
+    shl             filterd, 6
+    movifnidn            hd, hm
+    movu                xm0, [tlq-6]
+    pmovsxbw             m7, [base+filter_intra_taps+filterq+32*0]
+    pmovsxbw             m8, [base+filter_intra_taps+filterq+32*1]
+    mov                 r5d, r8m ; bitdepth_max
+    movsldup             m9, [base+filter_permA]
+    movshdup            m10, [base+filter_permA]
+    shr                 r5d, 11  ; is_12bpc
+    jnz .12bpc
+    psllw                m7, 2   ; upshift multipliers so that packusdw
+    psllw                m8, 2   ; will perform clipping for free
+.12bpc:
+    vpbroadcastd         m5, [base+filter_rnd+r5*8]
+    vpbroadcastd         m6, [base+filter_shift+r5*8]
+    sub                  wd, 8
+    jl .w4
+.w8:
+    call .main4
+    movsldup            m11, [filter_permB]
+    lea                 r5d, [hq*2+2]
+    movshdup            m12, [filter_permB]
+    lea                topq, [tlq+2]
+    mova                m13, [filter_permC]
+    sub                  hd, 4
+    vinserti32x4        ym0, [topq], 1 ; a0 b0   t0 t1
+    sub                 tlq, r5
+%if WIN64
+    push                 r7
+    push                 r8
+%endif
+    mov                  r7, dstq
+    mov                 r8d, hd
+.w8_loop:
+    movlps              xm4, xm0, [tlq+hq*2]
+    call .main8
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jge .w8_loop
+    test                 wd, wd
+    jz .end
+    mov                 r2d, 0x0d
+    kmovb                k1, r2d
+    lea                  r2, [strideq*3]
+.w16:
+    movd               xmm0, [r7+strideq*1+12]
+    vpblendd           xmm0, [topq+8], 0x0e ; t1 t2
+    pinsrw              xm4, xmm0, [r7+strideq*0+14], 2
+    call .main8
+    add                  r7, 16
+    vinserti32x4        ym0, [topq+16], 1   ; a2 b2   t2 t3
+    mov                  hd, r8d
+    mov                dstq, r7
+    add                topq, 16
+.w16_loop:
+    movd               xmm1, [dstq+strideq*2-4]
+    punpcklwd           xm4, xmm1, xmm0
+    movd               xmm0, [dstq+r2-4]
+    shufps          xm4{k1}, xmm0, xm0, q3210
+    call .main8
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jge .w16_loop
+    sub                  wd, 8
+    jg .w16
+.end:
+    vpermb               m2, m11, m0
+    mova                ym1, ym5
+    vpdpwssd             m1, m2, m7
+    vpermb               m2, m12, m0
+    vpdpwssd             m1, m2, m8
+%if WIN64
+    pop                  r8
+    pop                  r7
+%endif
+    vextracti32x8       ym2, m1, 1
+    paddd               ym1, ym2
+    packusdw            ym1, ym1
+    vpsrlvw             ym1, ym6
+    vpermt2q             m0, m13, m1
+    vextracti32x4 [dstq+strideq*0], m0, 2
+    vextracti32x4 [dstq+strideq*1], ym0, 1
+    RET
+.w4_loop:
+    movlps              xm0, [tlq-10]
+    lea                dstq, [dstq+strideq*2]
+    sub                 tlq, 4
+.w4:
+    call .main4
+    movq   [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xm0
+    sub                  hd, 2
+    jg .w4_loop
+    RET
+ALIGN function_align
+.main4:
+    vpermb               m2, m9, m0
+    mova                ym1, ym5
+    vpdpwssd             m1, m2, m7
+    vpermb               m0, m10, m0
+    vpdpwssd             m1, m0, m8
+    vextracti32x8       ym0, m1, 1
+    paddd               ym0, ym1
+    vextracti32x4       xm1, ym0, 1
+    packusdw            xm0, xm1     ; clip
+    vpsrlvw             xm0, xm6
+    ret
+ALIGN function_align
+.main8:
+    vpermb               m3, m11, m0
+    mova                ym2, ym5
+    vpdpwssd             m2, m3, m7
+    vpermb               m3, m9, m4
+    mova                ym1, ym5
+    vpdpwssd             m1, m3, m7
+    vpermb               m3, m12, m0
+    vpdpwssd             m2, m3, m8
+    vpermb               m3, m10, m4
+    vpdpwssd             m1, m3, m8
+    vextracti32x8       ym4, m2, 1
+    vextracti32x8       ym3, m1, 1
+    paddd               ym2, ym4
+    paddd               ym1, ym3
+    packusdw            ym1, ym2     ; clip
+    vpsrlvw             ym1, ym6
+    vpermt2q             m0, m13, m1 ; c0 d0   b0 b1   a0 a1
+    vextracti32x4 [dstq+strideq*0], m0, 2
+    vextracti32x4 [dstq+strideq*1], ym0, 1
+    ret
+
+%endif
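
The new 16bpc AVX-512 intra prediction file above covers paeth, the three smooth modes, pal_pred and ipred_filter. The PAETH macro at the top is the usual AV1 Paeth selection; as a scalar sketch (an illustrative helper, not code from the tree):

    #include <stdlib.h>

    /* Pick whichever of left/top/topleft is closest to left + top - topleft,
     * with ties resolved in that order (matches the k1/k2 mask logic above). */
    static int paeth_px(const int left, const int top, const int topleft) {
        const int base  = left + top - topleft;
        const int dleft = abs(base - left);    /* == |top  - topleft| */
        const int dtop  = abs(base - top);     /* == |left - topleft| */
        const int dtl   = abs(base - topleft);
        if (dleft <= dtop && dleft <= dtl) return left;
        return dtop <= dtl ? top : topleft;
    }

The smooth paths use the familiar bottom + round2(w * (top - bottom), 8) rearrangement of the weighted average; since the 1-D weights are stored pre-shifted by 7 (see the sm_weights comment in ipred16_avx2.asm), a single pmulhrsw performs both the multiply and the rounding shift.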
diff --git a/third_party/dav1d/src/x86/ipred16_sse.asm b/third_party/dav1d/src/x86/ipred16_sse.asm
index eaa56b67bcf05..07ea9567e1ab9 100644
--- a/third_party/dav1d/src/x86/ipred16_sse.asm
+++ b/third_party/dav1d/src/x86/ipred16_sse.asm
@@ -70,14 +70,6 @@ cextern filter_intra_taps
 
 SECTION .text
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 INIT_XMM ssse3
 cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
     LEA                  r5, ipred_dc_left_16bpc_ssse3_table
diff --git a/third_party/dav1d/src/x86/ipred_init_tmpl.c b/third_party/dav1d/src/x86/ipred_init_tmpl.c
index 3f1a3493c2551..0ba0a41088001 100644
--- a/third_party/dav1d/src/x86/ipred_init_tmpl.c
+++ b/third_party/dav1d/src/x86/ipred_init_tmpl.c
@@ -134,6 +134,7 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
     init_angular_ipred_fn(LEFT_DC_PRED,  ipred_dc_left,  avx512icl);
     init_angular_ipred_fn(HOR_PRED,      ipred_h,        avx512icl);
     init_angular_ipred_fn(VERT_PRED,     ipred_v,        avx512icl);
+#endif
     init_angular_ipred_fn(PAETH_PRED,    ipred_paeth,    avx512icl);
     init_angular_ipred_fn(SMOOTH_PRED,   ipred_smooth,   avx512icl);
     init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl);
@@ -142,5 +143,4 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
 
     c->pal_pred = BF(dav1d_pal_pred, avx512icl);
 #endif
-#endif
 }
diff --git a/third_party/dav1d/src/x86/itx16_avx2.asm b/third_party/dav1d/src/x86/itx16_avx2.asm
index 071ecbc33a268..c580944c7bbf5 100644
--- a/third_party/dav1d/src/x86/itx16_avx2.asm
+++ b/third_party/dav1d/src/x86/itx16_avx2.asm
@@ -145,14 +145,6 @@ cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end
 
 SECTION .text
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
 
 %macro WRAP_XMM 1+
diff --git a/third_party/dav1d/src/x86/itx16_sse.asm b/third_party/dav1d/src/x86/itx16_sse.asm
index fa8724691f3ae..4fb30ef4e7a6f 100644
--- a/third_party/dav1d/src/x86/itx16_sse.asm
+++ b/third_party/dav1d/src/x86/itx16_sse.asm
@@ -174,14 +174,6 @@ tbl_Nx64_offset: db 2* 0, 2*32, 2*16, 2*46
 
 SECTION .text
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 %define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx)
 %define m(x) m_suffix(x, SUFFIX)
 
diff --git a/third_party/dav1d/src/x86/itx_avx2.asm b/third_party/dav1d/src/x86/itx_avx2.asm
index 9cd66443482e8..092c842786dfe 100644
--- a/third_party/dav1d/src/x86/itx_avx2.asm
+++ b/third_party/dav1d/src/x86/itx_avx2.asm
@@ -132,15 +132,6 @@ SECTION .text
 ; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
 %define o_base deint_shuf + 128
 %define o(x) (r6 - (o_base) + (x))
-
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
 
 ; flags: 1 = swap, 2 = interleave, 4: coef_regs
diff --git a/third_party/dav1d/src/x86/itx_avx512.asm b/third_party/dav1d/src/x86/itx_avx512.asm
index 32b68b1548367..7d01bccb4f52c 100644
--- a/third_party/dav1d/src/x86/itx_avx512.asm
+++ b/third_party/dav1d/src/x86/itx_avx512.asm
@@ -242,15 +242,6 @@ SECTION .text
 
 %define o_base int8_permA+64*18
 %define o(x) (r5 - (o_base) + (x))
-
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
 
 ; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack,
diff --git a/third_party/dav1d/src/x86/itx_init_tmpl.c b/third_party/dav1d/src/x86/itx_init_tmpl.c
index 251d77e4fa1ee..467d38293209d 100644
--- a/third_party/dav1d/src/x86/itx_init_tmpl.c
+++ b/third_party/dav1d/src/x86/itx_init_tmpl.c
@@ -278,7 +278,27 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
 
     if (bpc > 10) return;
 
-#if BITDEPTH == 16
+#if BITDEPTH == 8
+    assign_itx16_fn( ,  4,  4, avx2);
+    assign_itx16_fn(R,  4,  8, avx2);
+    assign_itx16_fn(R,  4, 16, avx2);
+    assign_itx16_fn(R,  8,  4, avx2);
+    assign_itx16_fn( ,  8,  8, avx2);
+    assign_itx16_fn(R,  8, 16, avx2);
+    assign_itx2_fn (R,  8, 32, avx2);
+    assign_itx16_fn(R, 16,  4, avx2);
+    assign_itx16_fn(R, 16,  8, avx2);
+    assign_itx12_fn( , 16, 16, avx2);
+    assign_itx2_fn (R, 16, 32, avx2);
+    assign_itx1_fn (R, 16, 64, avx2);
+    assign_itx2_fn (R, 32,  8, avx2);
+    assign_itx2_fn (R, 32, 16, avx2);
+    assign_itx2_fn ( , 32, 32, avx2);
+    assign_itx1_fn (R, 32, 64, avx2);
+    assign_itx1_fn (R, 64, 16, avx2);
+    assign_itx1_fn (R, 64, 32, avx2);
+    assign_itx1_fn ( , 64, 64, avx2);
+#elif BITDEPTH == 16
     assign_itx16_bpc_fn( ,  4,  4, 10, avx2);
     assign_itx16_bpc_fn(R,  4,  8, 10, avx2);
     assign_itx16_bpc_fn(R,  4, 16, 10, avx2);
diff --git a/third_party/dav1d/src/x86/itx_sse.asm b/third_party/dav1d/src/x86/itx_sse.asm
index 7cbd9c3f3be59..ec7e3a52f4676 100644
--- a/third_party/dav1d/src/x86/itx_sse.asm
+++ b/third_party/dav1d/src/x86/itx_sse.asm
@@ -142,14 +142,6 @@ pw_m301x8:      times 8 dw  -301*8
 
 SECTION .text
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
 
 %if ARCH_X86_64
@@ -2388,7 +2380,7 @@ INV_TXFM_8X16_FN identity, identity
 cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS    coeffq+16*1, 32, 1
     mov                    r3, tx2q
-    lea                  tx2q, [o(m(iidentity_8x16_internal_8bpc).pass1_end)]
+    lea                  tx2q, [o(.pass1_end)]
     mova   [rsp+gprsize+16*1], m6
     jmp  m(idct_8x8_internal_8bpc).pass1_end3
 
@@ -2400,7 +2392,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp  m(idct_8x8_internal_8bpc).pass1_end3
 
 .pass2:
-    lea                  tx2q, [o(m(iidentity_8x16_internal_8bpc).end1)]
+    lea                  tx2q, [o(.end1)]
 
 .end:
     mova   [rsp+gprsize+16*0], m7
@@ -2456,7 +2448,7 @@ cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS    coeffq+16*1, 32, 1
     call  .main
     mov                    r3, tx2q
-    lea                  tx2q, [o(m(idct_16x8_internal_8bpc).pass1_end)]
+    lea                  tx2q, [o(.pass1_end)]
     jmp  m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end:
@@ -2467,7 +2459,7 @@ cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp  m(idct_8x8_internal_8bpc).pass1_end
 
 .pass2:
-    lea                  tx2q, [o(m(idct_16x8_internal_8bpc).end)]
+    lea                  tx2q, [o(.end)]
     lea                    r3, [dstq+8]
     jmp  m(idct_8x8_internal_8bpc).pass2_main
 
@@ -2595,7 +2587,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     call .main
     call .main_pass1_end
     mov                    r3, tx2q
-    lea                  tx2q, [o(m(iadst_16x8_internal_8bpc).pass1_end)]
+    lea                  tx2q, [o(.pass1_end)]
     jmp m(iadst_8x8_internal_8bpc).pass1_end
 
 .pass1_end:
@@ -2606,7 +2598,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp m(iadst_8x8_internal_8bpc).pass1_end
 
 .pass2:
-    lea                  tx2q, [o(m(iadst_16x8_internal_8bpc).end)]
+    lea                  tx2q, [o(.end)]
     lea                    r3, [dstq+8]
     jmp m(iadst_8x8_internal_8bpc).pass2_main
 
@@ -2880,7 +2872,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     mov                     r3, tx2q
-    lea                   tx2q, [o(m(iflipadst_16x8_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     jmp m(iflipadst_8x8_internal_8bpc).pass1_end
 
 .pass1_end:
@@ -2891,7 +2883,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp m(iflipadst_8x8_internal_8bpc).pass1_end
 
 .pass2:
-    lea                   tx2q, [o(m(iflipadst_16x8_internal_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     lea                     r3, [dstq+8]
     jmp m(iflipadst_8x8_internal_8bpc).pass2_main
 
@@ -2914,7 +2906,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                   m6, [coeffq-16*3]
     mova                   m7, [coeffq-16*1]
     mov                    r3, tx2q
-    lea                  tx2q, [o(m(iidentity_16x8_internal_8bpc).pass1_end)]
+    lea                  tx2q, [o(.pass1_end)]
 
 .pass1:
     mova                   m0, [o(pw_2896x8)]
@@ -2972,7 +2964,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp .pass1
 
 .pass2:
-    lea                  tx2q, [o(m(iidentity_16x8_internal_8bpc).end)]
+    lea                  tx2q, [o(.end)]
     lea                    r3, [dstq+8]
     jmp  m(iidentity_8x8_internal_8bpc).end
 
@@ -3010,7 +3002,7 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS     coeffq+16*3, 64
     call m(idct_16x8_internal_8bpc).main
     mov                     r3, tx2q
-    lea                   tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     mova                    m7, [o(pw_8192)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
@@ -3018,7 +3010,7 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_8ROWS    coeffq+16*17, 32
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     mova                    m7, [o(pw_8192)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
@@ -3029,7 +3021,7 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_7ROWS    rsp+gprsize+16*3, 16
     LOAD_8ROWS     coeffq+16*2, 64
     call m(idct_16x8_internal_8bpc).main
-    lea                   tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     mova                    m7, [o(pw_8192)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
@@ -3042,13 +3034,13 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass2:
-    lea                   tx2q, [o(m(idct_16x16_internal_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     jmp  m(idct_8x16_internal_8bpc).pass2_pre
 
 .end:
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_16x16_internal_8bpc).end1)]
+    lea                   tx2q, [o(.end1)]
     mov                   dstq, r3
     lea                     r3, [dstq+8]
     jmp   m(idct_8x8_internal_8bpc).end
@@ -3136,7 +3128,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     call m(iadst_16x8_internal_8bpc).main_pass1_end
 
     mov                     r3, tx2q
-    lea                   tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     mova                    m7, [o(pw_8192)]
     jmp  m(iadst_8x8_internal_8bpc).pass1_end1
 
@@ -3144,7 +3136,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_8ROWS    coeffq+16*17, 32
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     mova                    m7, [o(pw_8192)]
     jmp  m(iadst_8x8_internal_8bpc).pass1_end1
 
@@ -3154,7 +3146,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     call m(iadst_16x8_internal_8bpc).main
     call m(iadst_16x8_internal_8bpc).main_pass1_end
 
-    lea                   tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     mova                    m7, [o(pw_8192)]
     jmp  m(iadst_8x8_internal_8bpc).pass1_end1
 
@@ -3167,13 +3159,13 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp  m(iadst_8x8_internal_8bpc).pass1_end1
 
 .pass2:
-    lea                   tx2q, [o(m(iadst_16x16_internal_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     jmp m(iadst_8x16_internal_8bpc).pass2_pre
 
 .end:
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(iadst_16x16_internal_8bpc).end1)]
+    lea                   tx2q, [o(.end1)]
     mov                   dstq, r3
     lea                     r3, [dstq+8]
     jmp  m(iadst_8x8_internal_8bpc).end
@@ -3211,7 +3203,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     call m(iadst_16x8_internal_8bpc).main_pass1_end
 
     mov                     r3, tx2q
-    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     mova                    m7, [o(pw_m8192)]
     jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
 
@@ -3219,7 +3211,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_8ROWS     coeffq+16*1, 32
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     mova                    m7, [o(pw_m8192)]
     jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
 
@@ -3233,7 +3225,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_8ROWS     coeffq+16*0, 32
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     mova                    m7, [o(pw_m8192)]
     jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
 
@@ -3246,14 +3238,14 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
 
 .pass2:
-    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     lea                     r3, [dstq+8]
     jmp m(iflipadst_8x16_internal_8bpc).pass2_pre
 
 .end:
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).end1)]
+    lea                   tx2q, [o(.end1)]
     lea                   dstq, [dstq+strideq*2]
     jmp  m(iflipadst_8x8_internal_8bpc).end
 
@@ -3276,7 +3268,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova    [rsp+gprsize+16*5], m6
     mova    [rsp+gprsize+16*6], m7
 
-    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).end2)]
+    lea                   tx2q, [o(.end2)]
     mov                   dstq, r3
     jmp m(iflipadst_8x16_internal_8bpc).pass2_main
 
@@ -3300,7 +3292,7 @@ INV_TXFM_16X16_FN identity, identity
 cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     add                 coeffq, 16*17
     mov                     r3, tx2q
-    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
 
 .pass1:
     mova                    m6, [o(pw_1697x16)]
@@ -3321,13 +3313,13 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 .pass1_end:
     SAVE_8ROWS          coeffq, 32
     sub                 coeffq, 16
-    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp .pass1
 
 .pass1_end1:
     SAVE_8ROWS          coeffq, 32
     sub                 coeffq, 15*16
-    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     jmp .pass1
 
 .pass1_end2:
@@ -3338,7 +3330,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 
 .pass2:
     lea                     r3, [dstq+8]
-    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).end1)]
+    lea                   tx2q, [o(.end1)]
 
 .end:
     mova    [rsp+gprsize+16*0], m7
@@ -3361,7 +3353,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 
 .end1:
     LOAD_8ROWS     coeffq+16*1, 32
-    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).end2)]
+    lea                   tx2q, [o(.end2)]
     lea                   dstq, [dstq+strideq*2]
     jmp .end
 
@@ -3371,7 +3363,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 
     add                 coeffq, 32*8
     LOAD_8ROWS          coeffq, 32
-    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).end3)]
+    lea                   tx2q, [o(.end3)]
     mov                   dstq, r3
     jmp .end
 
@@ -3403,7 +3395,7 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob,
     pshuflw              m0, m0, q0000
     punpcklwd            m0, m0
     mov                 r3d, 8
-    lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x32_8bpc).end)]
+    lea                tx2q, [o(.end)]
     jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
 
 .end:
@@ -3412,14 +3404,13 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob,
 
 
 cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
     cmp                   eobd, 106
     jle .fast
 
     LOAD_8ROWS     coeffq+16*3, 64
     call  m(idct_8x8_internal_8bpc).main
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).pass1)]
+    lea                   tx2q, [o(.pass1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1:
@@ -3434,7 +3425,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS     coeffq+16*2, 64
     call  m(idct_8x8_internal_8bpc).main
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).pass1_1)]
+    lea                   tx2q, [o(.pass1_1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_1:
@@ -3451,7 +3442,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS     coeffq+16*1, 64
     call  m(idct_8x8_internal_8bpc).main
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end:
@@ -3466,7 +3457,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS     coeffq+16*0, 64
     call  m(idct_8x8_internal_8bpc).main
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
@@ -3514,11 +3505,11 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     call .main
 
 .pass2:
-    lea                     r3, [o(m(idct_8x32_internal_8bpc).end6)]
+    lea                     r3, [o(.end6)]
 
 .end:
     mova   [rsp+gprsize+16*0 ], m7
-    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).end2)]
+    lea                   tx2q, [o(.end2)]
 
 .end1:
     pxor                    m7, m7
@@ -3530,21 +3521,21 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp                   tx2q
 
 .end2:
-    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).end3)]
+    lea                   tx2q, [o(.end3)]
     jmp   m(idct_8x8_internal_8bpc).end
 
 .end3:
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova   [rsp+gprsize+16*0 ], m7
     lea                   dstq, [dstq+strideq*2]
-    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).end4)]
+    lea                   tx2q, [o(.end4)]
     jmp   m(idct_8x8_internal_8bpc).end
 
 .end4:
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova   [rsp+gprsize+16*0 ], m7
     lea                   dstq, [dstq+strideq*2]
-    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).end5)]
+    lea                   tx2q, [o(.end5)]
     jmp   m(idct_8x8_internal_8bpc).end
 
 .end5:
@@ -3883,7 +3874,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob,
     movd                    m2, [o(pw_8192)]
     mov               [coeffq], eobd
     mov                    r3d, 8
-    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
+    lea                   tx2q, [o(.end)]
 
 .body:
     pmulhrsw                m0, m2
@@ -3919,7 +3910,6 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob,
 
 
 cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
     LOAD_8ROWS     coeffq+16*0, 64
     call  m(idct_8x8_internal_8bpc).main
     SAVE_7ROWS    rsp+gprsize+16*3, 16
@@ -3958,55 +3948,55 @@ cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 
 .pass2:
     mova   [rsp+gprsize+16*0 ], m7
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     jmp  m(idct_8x32_internal_8bpc).end1
 
 .end:
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end1)]
+    lea                   tx2q, [o(.end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .end1:
     lea                     r3, [dstq+8]
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end2)]
+    lea                   tx2q, [o(.end2)]
     jmp   m(idct_8x8_internal_8bpc).pass2_main
 
 .end2:
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova   [rsp+gprsize+16*0 ], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end3)]
+    lea                   tx2q, [o(.end3)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .end3:
     mov                   dstq, r3
     add                     r3, 8
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end4)]
+    lea                   tx2q, [o(.end4)]
     jmp   m(idct_8x8_internal_8bpc).pass2_main
 
 .end4:
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova   [rsp+gprsize+16*0 ], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end5)]
+    lea                   tx2q, [o(.end5)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .end5:
     mov                   dstq, r3
     add                     r3, 8
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end6)]
+    lea                   tx2q, [o(.end6)]
     jmp   m(idct_8x8_internal_8bpc).pass2_main
 
 .end6:
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova   [rsp+gprsize+16*0 ], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end7)]
+    lea                   tx2q, [o(.end7)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .end7:
     mov                   dstq, r3
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
+    lea                   tx2q, [o(.end8)]
     jmp   m(idct_8x8_internal_8bpc).pass2_main
 
 .end8:
@@ -4085,6 +4075,7 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob
     test                  eobd, eobd
     jz .dconly
     call  m(idct_16x32_internal_8bpc)
+.end:
     RET
 
 .dconly:
@@ -4094,28 +4085,24 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob
     mov               [coeffq], eobd
     pmulhrsw                m0, m1
     mov                    r2d, 16
-    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_16x32_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
 
-.end:
-    RET
 
 cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     LOAD_8ROWS     coeffq+16*1, 128, 1
     call  m(idct_8x8_internal_8bpc).main
     SAVE_7ROWS    rsp+gprsize+16*3, 16
     LOAD_8ROWS     coeffq+16*5, 128, 1
     call m(idct_16x8_internal_8bpc).main
-    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end:
     SAVE_8ROWS    coeffq+16*33, 64               ;in8~in15
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end1:
@@ -4132,14 +4119,14 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_7ROWS    rsp+gprsize+16*3, 16
     LOAD_8ROWS     coeffq+16*4, 128, 1
     call m(idct_16x8_internal_8bpc).main
-    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end2:
     SAVE_8ROWS    coeffq+16*32, 64               ;in0~in7
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end3)]
+    lea                   tx2q, [o(.pass1_end3)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end3:
@@ -4182,14 +4169,14 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_7ROWS    rsp+gprsize+16*3, 16
     LOAD_8ROWS     coeffq+16*6, 128, 1
     call m(idct_16x8_internal_8bpc).main
-    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end4)]
+    lea                   tx2q, [o(.pass1_end4)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end4:
     SAVE_8ROWS    coeffq+16*34, 64               ;in16~in23
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end5)]
+    lea                   tx2q, [o(.pass1_end5)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end5:
@@ -4207,14 +4194,14 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_7ROWS    rsp+gprsize+16*3, 16
     LOAD_8ROWS     coeffq+16*7, 128, 1
     call m(idct_16x8_internal_8bpc).main
-    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end6)]
+    lea                   tx2q, [o(.pass1_end6)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end6:
     SAVE_8ROWS    coeffq+16*35, 64                        ;in24~in31
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end7)]
+    lea                   tx2q, [o(.pass1_end7)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end7:
@@ -4246,7 +4233,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     mov  [rsp+gprsize*1+16*35], eobd
     lea                     r3, [dstq+8]
     mov  [rsp+gprsize*2+16*35], r3
-    lea                     r3, [o(m(idct_16x32_internal_8bpc).end)]
+    lea                     r3, [o(.end)]
     jmp  m(idct_8x32_internal_8bpc).end
 
 .end:
@@ -4296,7 +4283,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_8ROWS   rsp+gprsize+16*11, 16
 
     call m(idct_8x32_internal_8bpc).main_fast
-    jmp  .end1
+    jmp m(idct_8x32_internal_8bpc).pass2
 
 .full1:
     mova                    m4, [coeffq+16*2 ]            ;in16
@@ -4337,12 +4324,9 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova   [rsp+gprsize+16*34], m7                        ;in31
 
     call m(idct_8x32_internal_8bpc).main
-
-.end1:
     jmp m(idct_8x32_internal_8bpc).pass2
 
 
-
 cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
 %if ARCH_X86_32
     LEA                     r5, $$
@@ -4390,10 +4374,8 @@ cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob
 
 
 cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     add                 coeffq, 16
-    lea                     r3, [o(m(idct_32x16_internal_8bpc).pass1_end1)]
+    lea                     r3, [o(.pass1_end1)]
 .pass1:
     LOAD_8ROWS     coeffq+16*0, 128, 1
     call  m(idct_8x8_internal_8bpc).main
@@ -4434,28 +4416,28 @@ cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_8ROWS     coeffq+16*0, 32
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova   [rsp+gprsize+16*0 ], m7
-    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end2:
     SAVE_8ROWS    coeffq+16*16, 32
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova   [rsp+gprsize+16*0 ], m7
-    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end3)]
+    lea                   tx2q, [o(.pass1_end3)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end3:
     SAVE_8ROWS    coeffq+16*32, 32
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova   [rsp+gprsize+16*0 ], m7
-    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end4)]
+    lea                   tx2q, [o(.pass1_end4)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end4:
     SAVE_8ROWS    coeffq+16*48, 32
 
     sub                 coeffq, 16
-    lea                     r3, [o(m(idct_32x16_internal_8bpc).end)]
+    lea                     r3, [o(.end)]
     jmp .pass1
 
 .end:
@@ -4463,8 +4445,6 @@ cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 
 
 cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     mov                    r4d, eobd
     cmp                   eobd, 43                ;if (eob > 43)
     sbb                    r3d, r3d               ;  iteration_count++
@@ -4528,8 +4508,6 @@ cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, c
 
 
 cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     mov                    r4d, 12                ;0100b
     mov                    r5d, 136               ;1000 1000b
     cmp                   eobd, 44                ;if (eob > 43)
@@ -4608,8 +4586,6 @@ cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob
 
 
 cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     mov                    r4d, 2
     sub                   eobd, 136
     mov  [rsp+gprsize*1+16*35], eobd
@@ -4684,7 +4660,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 .pass1_end:
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
@@ -4692,7 +4668,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end2:
@@ -4700,7 +4676,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end3)]
+    lea                   tx2q, [o(.pass1_end3)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end3:
@@ -4708,7 +4684,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end4)]
+    lea                   tx2q, [o(.pass1_end4)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end4:
@@ -4722,7 +4698,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 .pass2:
     mov                 coeffq, [rsp+gprsize*2+16*35]
     mov                    r3d, 4
-    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]
+    lea                   tx2q, [o(.pass2_end)]
 
 .pass2_loop:
     mov  [rsp+gprsize*3+16*35], r3d
@@ -4818,11 +4794,11 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp                   tx2q
 
 .pass2_end:
-    lea                     r3, [o(m(idct_32x32_internal_8bpc).pass2_end1)]
+    lea                     r3, [o(.pass2_end1)]
     jmp  m(idct_8x32_internal_8bpc).end
 
 .pass2_end1:
-    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]
+    lea                   tx2q, [o(.pass2_end)]
     add                 coeffq, 16*32
     mov                   dstq, [rsp+gprsize*2+16*35]
     mov                    r3d, [rsp+gprsize*3+16*35]
@@ -4833,8 +4809,6 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 
 
 cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     mov                    r4d, 2
     cmp                   eobd, 136
     mov                    r3d, 4
@@ -4895,8 +4869,8 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob
 %endif
     test                  eobd, eobd
     jz .dconly
-
     call m(idct_16x64_internal_8bpc)
+.end:
     RET
 
 .dconly:
@@ -4905,16 +4879,11 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob
     movd                    m2, [o(pw_8192)]
     mov               [coeffq], eobd
     mov                    r2d, 32
-    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_16x64_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
 
-.end:
-    RET
-
 
 cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     mov                    r4d, 2
     sub                   eobd, 151
     mov  [rsp+gprsize*1+16*67], eobd
@@ -4934,7 +4903,7 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS     coeffq+64*1, 64*2
     call m(idct_16x8_internal_8bpc).main
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end:
@@ -4942,7 +4911,7 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
@@ -4956,7 +4925,7 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     mov                    r3d, 2
     lea                     r4, [dstq+8]
     mov  [rsp+gprsize*2+16*67], r4
-    lea                     r4, [o(m(idct_16x64_internal_8bpc).end1)]
+    lea                     r4, [o(.end1)]
 
 .pass2_loop:
     mov  [rsp+gprsize*3+16*67], r3d
@@ -5082,23 +5051,47 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 .end1:
     LOAD_8ROWS   rsp+gprsize+16*35, 16
     lea                   dstq, [dstq+strideq*2]
-    add                    rsp, 16*32
-    lea                     r3, [o(m(idct_16x64_internal_8bpc).end2)]
-    jmp  m(idct_8x32_internal_8bpc).end
-
-.end2:
-    add                 coeffq, 16*32
-    sub                    rsp, 16*32
-
+    lea                     r3, [rsp+16*32+gprsize]
+    call .write
     mov                   dstq, [rsp+gprsize*2+16*67]
     mov                    r3d, [rsp+gprsize*3+16*67]
     lea                     r4, [dstq+8]
     mov  [rsp+gprsize*2+16*67], r4
-    lea                     r4, [o(m(idct_16x64_internal_8bpc).end1)]
+    lea                     r4, [o(.end1)]
 
     dec                    r3d
     jg .pass2_loop
     ret
+.write:
+    mova             [r3+16*0], m7
+    mov                     r4, -16*32
+    pxor                    m7, m7
+    sub                 coeffq, r4
+.zero_loop:
+    mova      [coeffq+r4+16*0], m7
+    mova      [coeffq+r4+16*1], m7
+    add                     r4, 16*2
+    jl .zero_loop
+    call .write_main2
+    LOAD_8ROWS        r3+16*11, 16
+    call .write_main
+    LOAD_8ROWS        r3+16*19, 16
+    call .write_main
+    LOAD_8ROWS        r3+16*27, 16
+.write_main:
+    mova             [r3+16*0], m7
+.write_main2:
+    mova                    m7, [o(pw_2048)]
+    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+    pmulhrsw                m7, [r3+16*0]
+    mova             [r3+16*2], m5
+    mova             [r3+16*1], m6
+    mova             [r3+16*0], m7
+    WRITE_8X4                0, 1, 2, 3, 5, 6, 7
+    lea                   dstq, [dstq+strideq*2]
+    WRITE_8X4                4, [r3+16*2], [r3+16*1], [r3+16*0], 5, 6, 7
+    lea                   dstq, [dstq+strideq*2]
+    ret
 
 
 ALIGN function_align
@@ -5765,7 +5758,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eo
     movd                    m2, [o(pw_8192)]
     mov               [coeffq], eobd
     mov                    r3d, 16
-    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_64x16_8bpc).end)]
+    lea                   tx2q, [o(.end)]
 
 .body:
     pmulhrsw                m0, m2
@@ -5895,7 +5888,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end:
@@ -5903,7 +5896,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
@@ -5911,7 +5904,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end2:
@@ -5919,7 +5912,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end3)]
+    lea                   tx2q, [o(.pass1_end3)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end3:
@@ -5927,7 +5920,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*35, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end4)]
+    lea                   tx2q, [o(.pass1_end4)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end4:
@@ -5935,7 +5928,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*43, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end5)]
+    lea                   tx2q, [o(.pass1_end5)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end5:
@@ -5943,7 +5936,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*51, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end6)]
+    lea                   tx2q, [o(.pass1_end6)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end6:
@@ -5951,7 +5944,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*59, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end7)]
+    lea                   tx2q, [o(.pass1_end7)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end7:
@@ -5979,14 +5972,14 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     call m(idct_16x8_internal_8bpc).main
 
     mov                    r3, dstq
-    lea                  tx2q, [o(m(idct_64x16_internal_8bpc).end)]
+    lea                  tx2q, [o(.end)]
     lea                  dstq, [dstq+strideq*8]
     jmp  m(idct_8x8_internal_8bpc).end
 
 .end:
     LOAD_8ROWS   rsp+gprsize+16*3, 16
     mova   [rsp+gprsize+16*0], m7
-    lea                  tx2q, [o(m(idct_64x16_internal_8bpc).end1)]
+    lea                  tx2q, [o(.end1)]
     mov                  dstq, r3
     jmp  m(idct_8x8_internal_8bpc).end
 
@@ -6016,14 +6009,14 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     call m(idct_16x8_internal_8bpc).main
 
     mov                    r3, dstq
-    lea                  tx2q, [o(m(idct_64x16_internal_8bpc).end2)]
+    lea                  tx2q, [o(.end2)]
     lea                  dstq, [dstq+strideq*8]
     jmp  m(idct_8x8_internal_8bpc).end
 
 .end2:
     LOAD_8ROWS   rsp+gprsize+16*3, 16
     mova   [rsp+gprsize+16*0], m7
-    lea                  tx2q, [o(m(idct_64x16_internal_8bpc).end3)]
+    lea                  tx2q, [o(.end3)]
     mov                  dstq, r3
     jmp  m(idct_8x8_internal_8bpc).end
 
@@ -6045,8 +6038,8 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob
 %endif
     test                  eobd, eobd
     jz .dconly
-
     call m(idct_32x64_internal_8bpc)
+.end:
     RET
 
 .dconly:
@@ -6056,16 +6049,11 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob
     mov               [coeffq], eobd
     pmulhrsw                m0, m1
     mov                    r3d, 64
-    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x64_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
 
-.end:
-    RET
-
 
 cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     mov                    r4d, 2
     sub                   eobd, 136
     mov  [rsp+gprsize*1+16*67], eobd
@@ -6133,28 +6121,28 @@ cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 
 .pass1_end:
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end1:
     SAVE_8ROWS     coeffq+64*0, 64
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end2:
     SAVE_8ROWS     coeffq+64*8, 64
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end3)]
+    lea                   tx2q, [o(.pass1_end3)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end3:
     SAVE_8ROWS    coeffq+64*16, 64
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end4)]
+    lea                   tx2q, [o(.pass1_end4)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end4:
@@ -6179,8 +6167,8 @@ cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eo
 %endif
     test                  eobd, eobd
     jz .dconly
-
     call m(idct_64x32_internal_8bpc)
+.end:
     RET
 
 .dconly:
@@ -6190,15 +6178,11 @@ cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eo
     pmulhrsw                m0, m1
     mov               [coeffq], eobd
     mov                    r3d, 32
-    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
 
-.end:
-    RET
 
 cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     mov                    r4d, 2
     sub                   eobd, 136
     mov  [rsp+gprsize*1+16*67], eobd
@@ -6266,56 +6250,56 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end:
     SAVE_8ROWS     coeffq+64*0, 64
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end1:
     SAVE_8ROWS     coeffq+64*8, 64
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end2:
     SAVE_8ROWS    coeffq+64*16, 64
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end3)]
+    lea                   tx2q, [o(.pass1_end3)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end3:
     SAVE_8ROWS    coeffq+64*24, 64
     LOAD_8ROWS   rsp+gprsize+16*35, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end4)]
+    lea                   tx2q, [o(.pass1_end4)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end4:
     SAVE_8ROWS       dstq+64*0, 64
     LOAD_8ROWS   rsp+gprsize+16*43, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end5)]
+    lea                   tx2q, [o(.pass1_end5)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end5:
     SAVE_8ROWS       dstq+64*8, 64
     LOAD_8ROWS   rsp+gprsize+16*51, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end6)]
+    lea                   tx2q, [o(.pass1_end6)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end6:
     SAVE_8ROWS      dstq+64*16, 64
     LOAD_8ROWS   rsp+gprsize+16*59, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end7)]
+    lea                   tx2q, [o(.pass1_end7)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end7:
@@ -6332,17 +6316,17 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     mov                   eobd, [rsp+gprsize*1+16*67]
     lea                   dstq, [dstq+32]
     mov  [rsp+gprsize*1+16*35], eobd
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)]
+    lea                   tx2q, [o(.pass2_end)]
     mov                    r3d, 4
     jmp m(idct_32x32_internal_8bpc).pass2_loop
 
 .pass2_end:
     mova    [rsp+gprsize+16*0], m7
-    lea                     r3, [o(m(idct_64x32_internal_8bpc).pass2_end1)]
+    lea                     r3, [o(.pass2_end1)]
     jmp  m(idct_8x32_internal_8bpc).end2
 
 .pass2_end1:
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)]
+    lea                   tx2q, [o(.pass2_end)]
     add                 coeffq, 16*32
     mov                   dstq, [rsp+gprsize*2+16*35]
     mov                    r3d, [rsp+gprsize*3+16*35]
@@ -6377,8 +6361,6 @@ cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eo
     jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
 
 cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     mov                    r5d, 4
     mov                    r4d, 2
     sub                   eobd, 136
@@ -6448,7 +6430,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end:
@@ -6456,7 +6438,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
@@ -6464,7 +6446,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end2:
@@ -6472,7 +6454,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end3)]
+    lea                   tx2q, [o(.pass1_end3)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end3:
@@ -6480,7 +6462,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*35, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end4)]
+    lea                   tx2q, [o(.pass1_end4)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end4:
@@ -6488,7 +6470,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*43, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end5)]
+    lea                   tx2q, [o(.pass1_end5)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end5:
@@ -6496,7 +6478,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*51, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end6)]
+    lea                   tx2q, [o(.pass1_end6)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end6:
@@ -6504,7 +6486,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*59, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end7)]
+    lea                   tx2q, [o(.pass1_end7)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end7:
@@ -6522,26 +6504,20 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     mov                    r3d, 4
     lea                     r4, [dstq+8]
     mov  [rsp+gprsize*2+16*67], r4
-    lea                     r4, [o(m(idct_64x64_internal_8bpc).pass2_end)]
+    lea                     r4, [o(.pass2_end)]
     jmp m(idct_16x64_internal_8bpc).pass2_loop
 
 .pass2_end:
     LOAD_8ROWS   rsp+gprsize+16*35, 16
     lea                   dstq, [dstq+strideq*2]
-    add                    rsp, 16*32
+    lea                     r3, [rsp+16*32+gprsize]
     mova    [rsp+gprsize+16*0], m7
-    lea                     r3, [o(m(idct_64x64_internal_8bpc).pass2_end1)]
-    jmp  m(idct_8x32_internal_8bpc).end2
-
-.pass2_end1:
-    add                 coeffq, 16*32
-    sub                    rsp, 16*32
-
+    call m(idct_16x64_internal_8bpc).write
     mov                   dstq, [rsp+gprsize*2+16*67]
     mov                    r3d, [rsp+gprsize*3+16*67]
     lea                     r4, [dstq+8]
     mov  [rsp+gprsize*2+16*67], r4
-    lea                     r4, [o(m(idct_64x64_internal_8bpc).pass2_end)]
+    lea                     r4, [o(.pass2_end)]
 
     dec                    r3d
     jg  m(idct_16x64_internal_8bpc).pass2_loop
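
The label rewrite that dominates the itx_sse.asm hunks above is, as far as the assembler is concerned, only a spelling change: NASM attaches a leading-dot label to the most recent non-local label, which inside each of these functions is exactly the symbol that m(...) mangles the cglobal name into, so [o(.pass1_end)] and [o(m(func_name).pass1_end)] resolve to the same address (the o(...) addressing wrapper is untouched, and cross-function jumps such as jmp m(idct_8x8_internal_8bpc).pass1_end keep the fully qualified form because they target a different function's local label). A minimal standalone sketch of the scoping rule, with a plain label standing in for the cglobal machinery:

    BITS 64
    idct_16x8_internal_8bpc:                           ; stand-in for the cglobal-defined symbol
        lea  rax, [idct_16x8_internal_8bpc.pass1_end]  ; fully qualified spelling
        lea  rax, [.pass1_end]                         ; equivalent local-label shorthand
    .pass1_end:
        ret
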
diff --git a/third_party/dav1d/src/x86/loopfilter16_avx2.asm b/third_party/dav1d/src/x86/loopfilter16_avx2.asm
index 0c8618655c46d..361ccc3b883ce 100644
--- a/third_party/dav1d/src/x86/loopfilter16_avx2.asm
+++ b/third_party/dav1d/src/x86/loopfilter16_avx2.asm
@@ -49,14 +49,6 @@ pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8
 
 SECTION .text
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 ;        in:            out:
 ; mm%1   a b c d        a e i m
 ; mm%2   e f g h        b f j n
diff --git a/third_party/dav1d/src/x86/loopfilter16_sse.asm b/third_party/dav1d/src/x86/loopfilter16_sse.asm
index 3ec3fd81fe3f0..c486b57a2113a 100644
--- a/third_party/dav1d/src/x86/loopfilter16_sse.asm
+++ b/third_party/dav1d/src/x86/loopfilter16_sse.asm
@@ -106,14 +106,6 @@ ASSERT ARCH_X86_32
 %endif
 %endmacro
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 %macro SPLATD 2
     movd %1, %2
     pshufd %1, %1, q0000
diff --git a/third_party/dav1d/src/x86/looprestoration16_avx2.asm b/third_party/dav1d/src/x86/looprestoration16_avx2.asm
index 98f51d8f1e5a7..ef25c28474416 100644
--- a/third_party/dav1d/src/x86/looprestoration16_avx2.asm
+++ b/third_party/dav1d/src/x86/looprestoration16_avx2.asm
@@ -66,14 +66,6 @@ cextern sgr_x_by_x_avx2
 
 SECTION .text
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers
 
 INIT_YMM avx2
diff --git a/third_party/dav1d/src/x86/mc16_avx2.asm b/third_party/dav1d/src/x86/mc16_avx2.asm
index 3dacfe66a6cf7..8b2ec4fa91fa7 100644
--- a/third_party/dav1d/src/x86/mc16_avx2.asm
+++ b/third_party/dav1d/src/x86/mc16_avx2.asm
@@ -202,14 +202,6 @@ cextern resize_filter
 
 SECTION .text
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f({%1})
-%endrep
-%endmacro
-
 INIT_XMM avx2
 cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
     mov                mxyd, r6m ; mx
diff --git a/third_party/dav1d/src/x86/mc16_avx512.asm b/third_party/dav1d/src/x86/mc16_avx512.asm
index c2ea090b0c5ba..e83b18ad969cb 100644
--- a/third_party/dav1d/src/x86/mc16_avx512.asm
+++ b/third_party/dav1d/src/x86/mc16_avx512.asm
@@ -254,14 +254,6 @@ cextern resize_filter
 
 SECTION .text
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 %if WIN64
 DECLARE_REG_TMP 4
 %else
diff --git a/third_party/dav1d/src/x86/mc16_sse.asm b/third_party/dav1d/src/x86/mc16_sse.asm
index 6435bd083cb18..fde8e372a3fed 100644
--- a/third_party/dav1d/src/x86/mc16_sse.asm
+++ b/third_party/dav1d/src/x86/mc16_sse.asm
@@ -166,14 +166,6 @@ cextern resize_filter
 
 SECTION .text
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 %if UNIX64
 DECLARE_REG_TMP 7
 %else
@@ -4799,9 +4791,7 @@ INIT_XMM ssse3
     psrad                m6, hsh_mem
     packssdw            m11, m6                     ; 7 8
 %if ARCH_X86_64
-    ; fixme a bug in x86inc.asm forces us to explicitly load m9
-    mova                 m9, [stk+0x40]
-    shufps               m9, m11, q1032 ; 6 7
+    shufps               m9, [stk+0x40], m11, q1032 ; 6 7
     mova                 m0, [stk+0x00]
     mova         [stk+0x40], m11
 %else
diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm
index fb55449f335a8..eb3ca1c427da5 100644
--- a/third_party/dav1d/src/x86/mc_avx512.asm
+++ b/third_party/dav1d/src/x86/mc_avx512.asm
@@ -303,14 +303,6 @@ BIDIR_JMP_TABLE blend_h, avx512icl,     2, 4, 8, 16, 32, 64, 128
 
 SECTION .text
 
-%macro REPX 2-*
-    %xdefine %%f(x) %1
-%rep %0 - 1
-    %rotate 1
-    %%f(%1)
-%endrep
-%endmacro
-
 %macro WRAP_YMM 1+
 INIT_YMM cpuname
     %1
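
Each of the .asm files above also drops its private copy of the REPX helper; the macro body is visible verbatim in the deleted preamble lines, and the new code (for example the .write_main2 block added to itx_sse.asm) still uses it, so it is presumably provided by a shared include now rather than redefined per file. For reference, the helper simply stamps out one instruction template per extra operand:

    ; Same definition as the deleted per-file copies above.
    %macro REPX 2-*
        %xdefine %%f(x) %1
    %rep %0 - 1
        %rotate 1
        %%f(%1)
    %endrep
    %endmacro

    ; So a use such as
    ;     REPX {pmulhrsw x, m7}, m0, m1, m2
    ; expands to
    ;     pmulhrsw m0, m7
    ;     pmulhrsw m1, m7
    ;     pmulhrsw m2, m7
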
diff --git a/third_party/dav1d/tests/checkasm/filmgrain.c b/third_party/dav1d/tests/checkasm/filmgrain.c
index a44a3ac422cb3..ff7ffc36c68a6 100644
--- a/third_party/dav1d/tests/checkasm/filmgrain.c
+++ b/third_party/dav1d/tests/checkasm/filmgrain.c
@@ -30,7 +30,7 @@
 #include <string.h>
 
 #include "src/levels.h"
-#include "src/film_grain.h"
+#include "src/filmgrain.h"
 #define UNIT_TEST 1
 #include "src/fg_apply_tmpl.c"
 
@@ -155,6 +155,7 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
 
     if (check_func(dsp->fgy_32x32xn, "fgy_32x32xn_%dbpc", BITDEPTH)) {
         ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 16,);
+        ALIGN_STK_64(uint8_t, scaling, SCALING_SIZE,);
         fg_data[0].seed = rnd() & 0xFFFF;
 
 #if BITDEPTH == 16
@@ -163,7 +164,6 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
         const int bitdepth_max = 0xff;
 #endif
 
-        uint8_t scaling[SCALING_SIZE];
         entry grain_lut[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
         fg_data[0].grain_scale_shift = rnd() & 3;
         fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
@@ -267,6 +267,7 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
                            BITDEPTH, ss_name[layout_idx], csfl))
             {
                 ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);
+                ALIGN_STK_64(uint8_t, scaling, SCALING_SIZE,);
 
                 fg_data[0].seed = rnd() & 0xFFFF;
 
@@ -278,7 +279,6 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
                 const int uv_pl = rnd() & 1;
                 const int is_identity = rnd() & 1;
 
-                uint8_t scaling[SCALING_SIZE];
                 entry grain_lut[2][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
                 fg_data[0].grain_scale_shift = rnd() & 3;
                 fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
@@ -368,7 +368,7 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
                         checkasm_check_pixel_padded_align(c_dst, stride,
                                                           a_dst, stride,
                                                           w, h, "dst",
-                                                          32 >> ss_x, 2);
+                                                          32 >> ss_x, 4);
                     }
                 }
 
@@ -380,7 +380,7 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
                         luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max;
                     }
                 }
-                bench_new(a_dst, src, stride, fg_data, 32, scaling, grain_lut[1], 16,
+                bench_new(a_dst, src, stride, fg_data, 64 >> ss_x, scaling, grain_lut[1], 32 >> ss_y,
                           1, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
             }
         }
