# Basic concat cases.
# NOTE(review): the comments in this section assume an option stays in effect
# until it is overridden or --reset is seen -- confirm against the driver docs.
--reset

# f32 and bf16 data; destination tag is undef, abx, or aBx16b.
--dtag=undef,abx,aBx16b
--sdt=f32,bf16
--ddt=f32,bf16
# Two abx sources, one case per concat axis (3, 2, 0, 1). The two shapes differ
# only along the concat axis; the axis=2 case has a zero-sized first source.
--stag=abx:abx
--axis=3 3x4x5x13:3x4x5x17
--axis=2 5x2x0x8:5x2x8x8
--axis=0 7x8x4x9:9x8x4x9
--axis=1 4x16x2x1:4x8x2x1

# Three abx sources, 5D shapes, concatenated along axis 1 (48 + 32 + 16).
--stag=abx:abx:abx
--axis=1
6x48x3x4x5:6x32x3x4x5:6x16x3x4x5

# sizes less than blocks (+zero dim)
# Four sources with mixed tags; every axis-1 size is below 16, and each shape
# list contains exactly one source whose axis-1 size is 0 (in a different
# position on each line).
--stag=aBx16b:abx:aBx8b:axb
--axis=1
6x8x3x4:6x2x3x4:6x3x3x4:6x0x3x4
6x8x3x4:6x1x3x4:6x0x3x4:6x3x3x4
6x0x3x4:6x3x3x4:6x5x3x4:6x5x3x4

# f16
# Same tag and shape lists as the f32/bf16 cases above, with source and
# destination data types switched to f16. --dtag is not re-specified here, so
# presumably the dtag list set at the top still applies -- TODO confirm.
--sdt=f16
--ddt=f16
--stag=abx:abx
--axis=3 3x4x5x13:3x4x5x17
--axis=2 5x2x0x8:5x2x8x8
--axis=0 7x8x4x9:9x8x4x9
--axis=1 4x16x2x1:4x8x2x1

--stag=abx:abx:abx
--axis=1
6x48x3x4x5:6x32x3x4x5:6x16x3x4x5

# sizes less than blocks (+zero dim)
--stag=aBx16b:abx:aBx8b:axb
--axis=1
6x8x3x4:6x2x3x4:6x3x3x4:6x0x3x4
6x8x3x4:6x1x3x4:6x0x3x4:6x3x3x4
6x0x3x4:6x3x3x4:6x5x3x4:6x5x3x4

# stag different from dtag
# Both sources are abx while the destination is axb. Data types are not
# re-specified here; presumably still f16 from the section above -- TODO confirm.
--stag=abx:abx
--dtag=axb
--axis=1 2x16x3x4:2x16x3x4

# Run the CI test set in Nightly as well.
# Each --batch below names another input file; their contents are not visible
# from this file. Every include is preceded by --reset.
--reset
--batch=test_concat_ci

# Test layers of some key and ext GPU DL Frameworks
--reset
--batch=option_set_fwks_key_gpu
--reset
--batch=option_set_fwks_ext_gpu

# additional tests for xe concat kernel
--reset
--batch=option_set_xe_gpu

# Tests with many source arguments, concatenated along axis 1.
# --stag carries one tag per source, so the number of tags must match the
# number of shapes in the problem line that follows it.
--reset
--axis=1
# Many identical plain (ab) 512x128 sources.
--stag=ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab:ab
--dtag=ab
512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128:512x128

# Twelve aBx16b sources whose axis-1 sizes mix multiples of 16 (32, 64, 128)
# with sizes that are not (1, 9, 11, 7). --axis=1 from above is not
# re-specified here.
--stag=aBx16b:aBx16b:aBx16b:aBx16b:aBx16b:aBx16b:aBx16b:aBx16b:aBx16b:aBx16b:aBx16b:aBx16b
--dtag=aBx16b
10x32x16:10x1x16:10x64x16:10x9x16:10x11x16:10x128x16:10x7x16:10x32x16:10x32x16:10x32x16:10x32x16:10x32x16

# Regression tests.
# Each case is preceded by its own --reset and spells out only the options it
# needs; anything not listed is left to the driver's defaults.
--reset
# f32, plain abcd, concat along the last axis, which has size 1 in both sources.
--sdt=f32 --ddt=f32 --stag=abcd:abcd --dtag=abcd --axis=3 32x8x8x1:32x8x8x1
--reset
# f16, aBx16b sources and destination; no --axis given.
--sdt=f16 --ddt=f16 --stag=aBx16b:aBx16b --dtag=aBx16b 1x32x6x6:1x32x6x6
--reset
# Two 6D abx sources with axis-1 size 1; data types, dtag and axis not given.
--stag=abx:abx 256x1x8x8x3x3:256x1x8x8x3x3

# Only the first of these should dispatch to simple concat
# NOTE(review): it is not clear from this file whether "the first" means the
# first shape line (the one named "simple_concat_ok") or the first
# --attr-scales variant -- confirm with the kernel's dispatch conditions.
--reset
--allow-enum-tags-only=0
# Three sources: the first is abcd, the other two are acbd; destination abcd.
--stag=abcd:acbd:acbd --dtag=abcd --axis=1
# Three scale variants: none (empty first entry), a common scale on src0 only,
# and common scales on both src0 and src1.
--attr-scales=,msrc0:common:1.5,msrc0:common:1.5+msrc1:common:2.5
4x1x1x4:4x1x1x4:4x1x1x4_n"simple_concat_ok"
4x2x1x4:4x2x1x4:4x2x1x4
4x1x2x4:4x1x2x4:4x1x2x4

# These two lines override stag/dtag/axis inline. --attr-scales is not
# re-specified, so presumably the list above still applies -- TODO confirm.
--stag=abcde:abcde --dtag=abdce --axis=3 32x6x26x1x9:32x6x26x1x9
--stag=abc:acb:acb --dtag=abc --axis=1 4x1x4:4x1x4:4x1x4

# many inputs with scales
# u8 nchw sources of shape 1x1x40x40, each given its own common scale:
# msrc0 through msrc16 with values 2 through 18. No --axis is given here.
--reset
--sdt=u8 --ddt=u8
--stag=nchw:nchw:nchw:nchw:nchw:nchw:nchw:nchw:nchw:nchw:nchw:nchw:nchw:nchw:nchw:nchw:nchw
--dtag=nchw
--attr-scales=msrc0:common:2+msrc1:common:3+msrc2:common:4+msrc3:common:5+msrc4:common:6+msrc5:common:7+msrc6:common:8+msrc7:common:9+msrc8:common:10+msrc9:common:11+msrc10:common:12+msrc11:common:13+msrc12:common:14+msrc13:common:15+msrc14:common:16+msrc15:common:17+msrc16:common:18
1x1x40x40:1x1x40x40:1x1x40x40:1x1x40x40:1x1x40x40:1x1x40x40:1x1x40x40:1x1x40x40:1x1x40x40:1x1x40x40:1x1x40x40:1x1x40x40:1x1x40x40:1x1x40x40:1x1x40x40:1x1x40x40:1x1x40x40

# additional tests for internal padding concat kernel
# Two sources concatenated along axis 1, with blocked tags of block size
# 32, 16, 8 and 4 on that axis, over f32, f16, bf16 and s8.
--reset
--dtag=aBc32b,aBc16b,aBc8b,aBc4b
--sdt=f32,f16,bf16,s8
--ddt=f32,f16,bf16,s8
--stag=aBc32b:aBc32b,aBc16b:aBc16b,aBc8b:aBc8b,aBc4b:aBc4b
--axis=1
# The first source's axis-1 size (33, 34 or 129) is not a multiple of any of
# the listed block sizes. The last dimension varies between an odd size (17),
# a small power of two (16) and a large one (4096).
4x33x17:4x32x17
4x34x17:4x32x17
4x129x17:4x129x17
4x33x16:4x32x16
4x33x4096:4x32x4096
4x129x16:4x129x16
4x129x4096:4x129x4096

# tests with small internal padding concat
# Two aBc4b sources concatenated along axis 1. The per-case notes below name
# the kernel condition each shape is meant to hit; the kernel itself is not
# visible from this file.
--reset
--dtag=aBc4b
--stag=aBc4b:aBc4b
--axis=1
# src0 < elems_per_sg
4x2x8:4x2x8
# sg_size > boundary (no trailing blocks)
4x34x8:4x4x8
# boundary_blocks only
4x2x32:4x2x32

# large internal padding concat for idx_t=ulong
# Single bf16 case with a very large first source (4x337810x992) and a small
# second one (4x17x992); neither axis-1 size is a multiple of 32.
# NOTE(review): presumably sized so offsets no longer fit a 32-bit index and
# the kernel has to use idx_t=ulong -- confirm against the kernel.
--reset
--dtag=aBc32b
--stag=aBc32b:aBc32b
--axis=1
--sdt=bf16 --ddt=bf16 4x337810x992:4x17x992

# test varying internal padding tags/formats
# u8, axis 1, same 33 + 32 problem run under two 4D tags.
--reset
--sdt=u8
--ddt=u8
--axis=1
# Blocked by 32 on dimension b only.
--stag=aBcd32b:aBcd32b
--dtag=aBcd32b
4x33x8x8:4x32x8x8

# Blocked by 32 on both dimensions a and b.
--stag=ABcd32a32b:ABcd32a32b
--dtag=ABcd32a32b
4x33x8x8:4x32x8x8
