blur3x3_hvx128

module name=blur3x3_hvx128, target=arm-64-osx {
func blur3x3_hvx128(input, blur_y)
{
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((uint64(input.buffer) != (uint64)0), halide_error_buffer_argument_is_null("input"))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((uint64(blur_y.buffer) != (uint64)0), halide_error_buffer_argument_is_null("blur_y"))

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 36%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let blur_y = _halide_buffer_get_host(blur_y.buffer)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 36%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let blur_y.type = _halide_buffer_get_type(blur_y.buffer)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 35%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let blur_y.device_dirty = _halide_buffer_get_device_dirty(blur_y.buffer)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 35%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let blur_y.dimensions = _halide_buffer_get_dimensions(blur_y.buffer)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 35%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let blur_y.min.0 = _halide_buffer_get_min(blur_y.buffer, 0)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 35%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let blur_y.extent.0 = _halide_buffer_get_extent(blur_y.buffer, 0)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 34%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let blur_y.stride.0 = _halide_buffer_get_stride(blur_y.buffer, 0)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 34%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let blur_y.min.1 = _halide_buffer_get_min(blur_y.buffer, 1)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 34%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let blur_y.extent.1 = _halide_buffer_get_extent(blur_y.buffer, 1)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 33%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let blur_y.stride.1 = _halide_buffer_get_stride(blur_y.buffer, 1)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 33%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let input = _halide_buffer_get_host(input.buffer)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 33%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let input.type = _halide_buffer_get_type(input.buffer)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 33%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let input.device_dirty = _halide_buffer_get_device_dirty(input.buffer)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 32%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let input.dimensions = _halide_buffer_get_dimensions(input.buffer)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 32%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let input.min.0 = _halide_buffer_get_min(input.buffer, 0)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 32%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let input.extent.0 = _halide_buffer_get_extent(input.buffer, 0)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 31%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let input.stride.0 = _halide_buffer_get_stride(input.buffer, 0)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 31%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let input.min.1 = _halide_buffer_get_min(input.buffer, 1)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 31%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let input.extent.1 = _halide_buffer_get_extent(input.buffer, 1)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 31%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let input.stride.1 = _halide_buffer_get_stride(input.buffer, 1)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 1%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
if (_halide_buffer_is_bounds_query(blur_y.buffer))
{
Loop Depth 0
Computation Cost 1%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
_halide_buffer_init(blur_y.buffer, _halide_buffer_get_shape(blur_y.buffer), (void *)((uint64)0), (uint64)0, (halide_device_interface_t *)((uint64)0), 1, 16, 2, make_struct(0, 1920, 1, 0, 0, 1080, 1920, 0), (uint64)0)
}
Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 1%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
if (_halide_buffer_is_bounds_query(input.buffer))
{
Loop Depth 0
Computation Cost 1%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
_halide_buffer_init(input.buffer, _halide_buffer_get_shape(input.buffer), (void *)((uint64)0), (uint64)0, (halide_device_interface_t *)((uint64)0), 1, 16, 2, make_struct(0, 1922, 1, 0, 0, 1082, 1922, 0), (uint64)0)
}
Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 26%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
if (!(_halide_buffer_is_bounds_query(blur_y.buffer) || _halide_buffer_is_bounds_query(input.buffer)))
{
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((blur_y.type == (uint32)69633), halide_error_bad_type("Output buffer blur_y", blur_y.type, (uint32)69633))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((blur_y.dimensions == 2), halide_error_bad_dimensions("Output buffer blur_y", blur_y.dimensions, 2))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((input.type == (uint32)69633), halide_error_bad_type("Input buffer input", input.type, (uint32)69633))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((input.dimensions == 2), halide_error_bad_dimensions("Input buffer input", input.dimensions, 2))
Loop Depth 0
Computation Cost 1%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert(((blur_y.min.0 <= 0) && (1920 <= (blur_y.extent.0 + blur_y.min.0))), halide_error_access_out_of_bounds("Output buffer blur_y", 0, 0, 1919, blur_y.min.0, ((blur_y.extent.0 + blur_y.min.0) + -1)))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((0 <= blur_y.extent.0), halide_error_buffer_extents_negative("Output buffer blur_y", 0, blur_y.extent.0))
Loop Depth 0
Computation Cost 1%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert(((blur_y.min.1 <= 0) && (1080 <= (blur_y.extent.1 + blur_y.min.1))), halide_error_access_out_of_bounds("Output buffer blur_y", 1, 0, 1079, blur_y.min.1, ((blur_y.extent.1 + blur_y.min.1) + -1)))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((0 <= blur_y.extent.1), halide_error_buffer_extents_negative("Output buffer blur_y", 1, blur_y.extent.1))
Loop Depth 0
Computation Cost 1%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert(((input.min.0 <= 0) && (1922 <= (input.extent.0 + input.min.0))), halide_error_access_out_of_bounds("Input buffer input", 0, 0, 1921, input.min.0, ((input.extent.0 + input.min.0) + -1)))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((0 <= input.extent.0), halide_error_buffer_extents_negative("Input buffer input", 0, input.extent.0))
Loop Depth 0
Computation Cost 1%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert(((input.min.1 <= 0) && (1082 <= (input.extent.1 + input.min.1))), halide_error_access_out_of_bounds("Input buffer input", 1, 0, 1081, input.min.1, ((input.extent.1 + input.min.1) + -1)))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((0 <= input.extent.1), halide_error_buffer_extents_negative("Input buffer input", 1, input.extent.1))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((blur_y.stride.0 == 1), halide_error_constraint_violated("blur_y.stride.0", blur_y.stride.0, "1", 1))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((blur_y.min.0 == 0), halide_error_constraint_violated("blur_y.min.0", blur_y.min.0, "0", 0))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((blur_y.extent.0 == 1920), halide_error_constraint_violated("blur_y.extent.0", blur_y.extent.0, "1920", 1920))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((blur_y.min.1 == 0), halide_error_constraint_violated("blur_y.min.1", blur_y.min.1, "0", 0))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((blur_y.extent.1 == 1080), halide_error_constraint_violated("blur_y.extent.1", blur_y.extent.1, "1080", 1080))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((input.stride.0 == 1), halide_error_constraint_violated("input.stride.0", input.stride.0, "1", 1))

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 9%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let input.total_extent.1 = (int64(input.extent.1) * int64(input.extent.0))

Loop Depth 0
Computation Cost 1%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((abs((int64(blur_y.stride.1) * (int64)1080)) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("blur_y", abs((int64(blur_y.stride.1) * (int64)1080)), (uint64)2147483647))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((uint64(input.extent.0) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("input", uint64(input.extent.0), (uint64)2147483647))
Loop Depth 0
Computation Cost 1%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((abs((int64(input.extent.1) * int64(input.stride.1))) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("input", abs((int64(input.extent.1) * int64(input.stride.1))), (uint64)2147483647))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((input.total_extent.1 <= (int64)2147483647), halide_error_buffer_extents_too_large("input", input.total_extent.1, (int64)2147483647))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert(!blur_y.device_dirty, halide_error_device_dirty_with_no_device_support("Output buffer blur_y"))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert(!input.device_dirty, halide_error_device_dirty_with_no_device_support("Input buffer input"))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((blur_y != (void *)((uint64)0)), halide_error_host_is_null("Output buffer blur_y"))
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((input != (void *)((uint64)0)), halide_error_host_is_null("Input buffer input"))
Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 2%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
produce blur_y
{
Click to see assembly code

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 2%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let t54 = (((2 - input.min.1) * input.stride.1) - input.min.0)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 1%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let parallel_closure = make_struct(blur_y, input, blur_y.stride.1, input.stride.1, t54)

Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
let closure_result = halide_do_par_for(::blur3x3_hvx128_par_for_blur_y_s0_y_y, 0, 34, (uint8_t *)(parallel_closure))

Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
assert((closure_result == 0), closure_result)
}
}
}
func blur3x3_hvx128_par_for_blur_y_s0_y_y(__user_context, blur_y.s0.y.y, closure_arg)
{

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 62%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 100%
[Click to see full hierarchy]
let closure_prototype = make_struct((void *)((uint64)0), (void *)((uint64)0), 0, 0, 0)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 61%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 100%
[Click to see full hierarchy]
let blur_y = load_typed_struct_member(closure_arg, closure_prototype, 0)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 61%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 100%
[Click to see full hierarchy]
let input = load_typed_struct_member(closure_arg, closure_prototype, 1)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 61%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 100%
[Click to see full hierarchy]
let blur_y.stride.1 = load_typed_struct_member(closure_arg, closure_prototype, 2)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 60%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 100%
[Click to see full hierarchy]
let input.stride.1 = load_typed_struct_member(closure_arg, closure_prototype, 3)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 60%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 100%
[Click to see full hierarchy]
let t54 = load_typed_struct_member(closure_arg, closure_prototype, 4)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 36%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 60%
[Click to see full hierarchy]
if ((blur_y.s0.y.y < 33))
{
Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 36%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 60%
[Click to see full hierarchy]
allocate blur_x[uint16 * 1920 * 4]

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 10%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 20%
[Click to see full hierarchy]
let t55 = (blur_y.s0.y.y * 32)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 10%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 20%
[Click to see full hierarchy]
for (blur_y.s0.y.yi.$n.rebased, 0, 2)
{
Click to see assembly code

Loop Depth 1
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 9%
[Click to see full hierarchy]
Loop Depth 1
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 20%
[Click to see full hierarchy]
let t57 = (blur_y.s0.y.yi.$n.rebased * 120)

Loop Depth 1
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 9%
[Click to see full hierarchy]
Loop Depth 1
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 20%
[Click to see full hierarchy]
let t56 = ((((blur_y.s0.y.yi.$n.rebased + t55) + -2) * input.stride.1) + t54)

Loop Depth 1
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 8%
[Click to see full hierarchy]
Loop Depth 1
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 20%
[Click to see full hierarchy]
for (blur_y.s0.x.x, 0, 120)
{
Click to see assembly code
Loop Depth 2
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 8%
[Click to see full hierarchy]
Loop Depth 2
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 20%
[Click to see full hierarchy]
produce blur_x
{

Loop Depth 2
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 8%
[Click to see full hierarchy]
Loop Depth 2
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 20%
[Click to see full hierarchy]
let t48 = ((blur_y.s0.x.x * 16) + t56)

Loop Depth 2
Computation Cost 7%
[Click to see full hierarchy]
Loop Depth 2
Data Movement Cost 20%
[Click to see full hierarchy]
blur_x[ramp(((blur_y.s0.x.x + t57) * 16), 1, 16)] = ((input[ramp((t48 + 1), 1, 16)] + (input[ramp((t48 + 2), 1, 16)] + input[ramp(t48, 1, 16)])) / x16((uint16)3))
}
}
}

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 25%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
let t58 = (blur_y.s0.y.y * 32)

Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 25%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
for (blur_y.s0.y.yi.$n, 0, 32)
{
Click to see assembly code

Loop Depth 1
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 24%
[Click to see full hierarchy]
Loop Depth 1
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
let t63 = (blur_y.s0.y.yi.$n + t58)

Loop Depth 1
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 24%
[Click to see full hierarchy]
Loop Depth 1
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
let t60 = (((t63 + 2) % 4) * 120)

Loop Depth 1
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 24%
[Click to see full hierarchy]
Loop Depth 1
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
let t62 = ((blur_y.s0.y.yi.$n % 4) * 120)

Loop Depth 1
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 23%
[Click to see full hierarchy]
Loop Depth 1
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
let t59 = ((input.stride.1 * t63) + t54)

Loop Depth 1
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 23%
[Click to see full hierarchy]
Loop Depth 1
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
for (blur_y.s0.x.x, 0, 120)
{
Click to see assembly code
Loop Depth 2
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 8%
[Click to see full hierarchy]
Loop Depth 2
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 20%
[Click to see full hierarchy]
produce blur_x
{

Loop Depth 2
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 8%
[Click to see full hierarchy]
Loop Depth 2
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 20%
[Click to see full hierarchy]
let t50 = ((blur_y.s0.x.x * 16) + t59)

Loop Depth 2
Computation Cost 7%
[Click to see full hierarchy]
Loop Depth 2
Data Movement Cost 20%
[Click to see full hierarchy]
blur_x[ramp(((blur_y.s0.x.x + t60) * 16), 1, 16)] = ((input[ramp((t50 + 1), 1, 16)] + (input[ramp((t50 + 2), 1, 16)] + input[ramp(t50, 1, 16)])) / x16((uint16)3))
}
Loop Depth 2
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 14%
[Click to see full hierarchy]
Loop Depth 2
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 20%
[Click to see full hierarchy]
consume blur_x
{
Loop Depth 2
Computation Cost 14%
[Click to see full hierarchy]
Loop Depth 2
Data Movement Cost 20%
[Click to see full hierarchy]
blur_y[ramp(((blur_y.stride.1 * t63) + (blur_y.s0.x.x * 16)), 1, 16)] = ((blur_x[ramp((((((t63 + 1) % 4) * 120) + blur_y.s0.x.x) * 16), 1, 16)] + (blur_x[ramp((((((t63 + 2) % 4) * 120) + blur_y.s0.x.x) * 16), 1, 16)] + blur_x[ramp(((blur_y.s0.x.x + t62) * 16), 1, 16)])) / x16((uint16)3))
}
}
}
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
free blur_x
}
Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 23%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
else
{
Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 23%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
allocate blur_x[uint16 * 1920 * 4]
Loop Depth 0
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 22%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
for (blur_y.s0.y.yi.$n.rebased, 0, 34)
{
Click to see assembly code

Loop Depth 1
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 22%
[Click to see full hierarchy]
Loop Depth 1
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
let t70 = (blur_y.s0.y.yi.$n.rebased + 1046)

Loop Depth 1
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 22%
[Click to see full hierarchy]
Loop Depth 1
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
let t66 = (2 <= blur_y.s0.y.yi.$n.rebased)

Loop Depth 1
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 21%
[Click to see full hierarchy]
Loop Depth 1
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
let t68 = (((blur_y.s0.y.yi.$n.rebased + 3) % 4) * 120)

Loop Depth 1
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 21%
[Click to see full hierarchy]
Loop Depth 1
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
let t67 = (((blur_y.s0.y.yi.$n.rebased + 2) % 4) * 120)

Loop Depth 1
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 20%
[Click to see full hierarchy]
Loop Depth 1
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
let t65 = ((blur_y.s0.y.yi.$n.rebased % 4) * 120)

Loop Depth 1
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 20%
[Click to see full hierarchy]
Loop Depth 1
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
let t69 = (blur_y.stride.1 * t70)

Loop Depth 1
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 19%
[Click to see full hierarchy]
Loop Depth 1
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
let t64 = ((input.stride.1 * t70) + t54)

Loop Depth 1
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 19%
[Click to see full hierarchy]
Loop Depth 1
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 40%
[Click to see full hierarchy]
for (blur_y.s0.x.x, 0, 120)
{
Click to see assembly code
Loop Depth 2
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 8%
[Click to see full hierarchy]
Loop Depth 2
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 20%
[Click to see full hierarchy]
produce blur_x
{

Loop Depth 2
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 8%
[Click to see full hierarchy]
Loop Depth 2
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 20%
[Click to see full hierarchy]
let t52 = ((blur_y.s0.x.x * 16) + t64)

Loop Depth 2
Computation Cost 7%
[Click to see full hierarchy]
Loop Depth 2
Data Movement Cost 20%
[Click to see full hierarchy]
blur_x[ramp(((blur_y.s0.x.x + t65) * 16), 1, 16)] = ((input[ramp((t52 + 1), 1, 16)] + (input[ramp((t52 + 2), 1, 16)] + input[ramp(t52, 1, 16)])) / x16((uint16)3))
}
Loop Depth 2
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 10%
[Click to see full hierarchy]
Loop Depth 2
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 20%
[Click to see full hierarchy]
if (t66)
{
Loop Depth 2
Computation Cost (Exclusive) 0%
Computation Cost (Inclusive) 10%
[Click to see full hierarchy]
Loop Depth 2
Data Movement Cost (Exclusive) 0%
Data Movement Cost (Inclusive) 20%
[Click to see full hierarchy]
consume blur_x
{
Click to see assembly code
Loop Depth 2
Computation Cost 10%
[Click to see full hierarchy]
Loop Depth 2
Data Movement Cost 20%
[Click to see full hierarchy]
blur_y[ramp(((blur_y.s0.x.x * 16) + t69), 1, 16)] = ((blur_x[ramp(((blur_y.s0.x.x + t68) * 16), 1, 16)] + (blur_x[ramp(((blur_y.s0.x.x + t65) * 16), 1, 16)] + blur_x[ramp(((blur_y.s0.x.x + t67) * 16), 1, 16)])) / x16((uint16)3))
}
}
}
}
Loop Depth 0
Computation Cost 0%
[Click to see full hierarchy]
Loop Depth 0
Data Movement Cost 0%
[Click to see full hierarchy]
free blur_x
}
}
}

Func: blur3x3_hvx128

  • If
    • Loop Depth 0
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 1%
      Loop Depth 0
      Data Movement Cost 0%
      if (...condition:
      (uint1)_halide_buffer_is_bounds_query((halide_buffer_t *)blur_y.buffer)
      )
  • If
    • Loop Depth 0
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 1%
      Loop Depth 0
      Data Movement Cost 0%
      if (...condition:
      (uint1)_halide_buffer_is_bounds_query((halide_buffer_t *)input.buffer)
      )
  • If
    • Loop Depth 0
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 26%
      Loop Depth 0
      Data Movement Cost 0%
      if (...condition:
      !((uint1)_halide_buffer_is_bounds_query((halide_buffer_t *)blur_y.buffer) || (uint1)_halide_buffer_is_bounds_query((halide_buffer_t *)input.buffer))
      )
      Loop Depth 0
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 2%
      Loop Depth 0
      Data Movement Cost 0%
      Produce blur_y
      Function Call

Func: blur3x3_hvx128_par_for_blur_y_s0_y_y

  • If
    • Loop Depth 0
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 36%
      Loop Depth 0
      Data Movement Cost (Exclusive) 0%
      Data Movement Cost (Inclusive) 60%
      if (blur_y.s0.y.y < 33)
      Loop Depth 0
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 36%
      Loop Depth 0
      Data Movement Cost (Exclusive) 0%
      Data Movement Cost (Inclusive) 60%
      Memory Type Auto
      Bit Size 16
      Vector Size 1
      Allocate blur_x
      Type | Dim-1 | Dim-2
      uint16 | 1920 | 4
      Loop Depth 0
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 10%
      Loop Depth 0
      Data Movement Cost (Exclusive) 0%
      Data Movement Cost (Inclusive) 20%
      For (blur_y.s0.y.yi.$n.rebased)
      Loop Span
      2
      Loop Depth 1
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 8%
      Loop Depth 1
      Data Movement Cost (Exclusive) 0%
      Data Movement Cost (Inclusive) 20%
      For (blur_y.s0.x.x)
      Loop Span
      120
      Loop Depth 2
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 8%
      Loop Depth 2
      Data Movement Cost (Exclusive) 0%
      Data Movement Cost (Inclusive) 20%
      Produce blur_x
      Vector Size 16
      Bit Size 32
      Store blur_x
      Written | Read
      blur_x: 16 | input: 48
      Loop Depth 2
      Computation Cost 2%
      Loop Depth 2
      Data Movement Cost 5%
      [Dense, Vector] Load input
      Ramp lanes 16
      Ramp stride 1
      Bit Size 32
      Vector Size 16
      Parameter input
      Loop Depth 2
      Computation Cost 2%
      Loop Depth 2
      Data Movement Cost 5%
      [Dense, Vector] Load input
      Ramp lanes 16
      Ramp stride 1
      Bit Size 32
      Vector Size 16
      Parameter input
      Loop Depth 2
      Computation Cost 1%
      Loop Depth 2
      Data Movement Cost 5%
      [Dense, Vector] Load input
      Ramp lanes 16
      Ramp stride 1
      Bit Size 32
      Vector Size 16
      Parameter input
      Loop Depth 0
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 25%
      Loop Depth 0
      Data Movement Cost (Exclusive) 0%
      Data Movement Cost (Inclusive) 40%
      For (blur_y.s0.y.yi.$n)
      Loop Span
      32
      Loop Depth 1
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 23%
      Loop Depth 1
      Data Movement Cost (Exclusive) 0%
      Data Movement Cost (Inclusive) 40%
      For (blur_y.s0.x.x)
      Loop Span
      120
      Loop Depth 2
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 8%
      Loop Depth 2
      Data Movement Cost (Exclusive) 0%
      Data Movement Cost (Inclusive) 20%
      Produce blur_x
      Vector Size 16
      Bit Size 32
      Store blur_x
      Written | Read
      blur_x: 16 | input: 48
      Loop Depth 2
      Computation Cost 2%
      Loop Depth 2
      Data Movement Cost 5%
      [Dense, Vector] Load input
      Ramp lanes 16
      Ramp stride 1
      Bit Size 32
      Vector Size 16
      Parameter input
      Loop Depth 2
      Computation Cost 2%
      Loop Depth 2
      Data Movement Cost 5%
      [Dense, Vector] Load input
      Ramp lanes 16
      Ramp stride 1
      Bit Size 32
      Vector Size 16
      Parameter input
      Loop Depth 2
      Computation Cost 1%
      Loop Depth 2
      Data Movement Cost 5%
      [Dense, Vector] Load input
      Ramp lanes 16
      Ramp stride 1
      Bit Size 32
      Vector Size 16
      Parameter input
      Loop Depth 2
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 14%
      Loop Depth 2
      Data Movement Cost (Exclusive) 0%
      Data Movement Cost (Inclusive) 20%
      Consume blur_x
      Vector Size 16
      Bit Size 32
      Store blur_y
      Written | Read
      blur_y: 16 | blur_x: 48
      Loop Depth 2
      Computation Cost 4%
      Loop Depth 2
      Data Movement Cost 5%
      [Dense, Vector] Load blur_x
      Ramp lanes 16
      Ramp stride 1
      Bit Size 32
      Vector Size 16
      Loop Depth 2
      Computation Cost 4%
      Loop Depth 2
      Data Movement Cost 5%
      [Dense, Vector] Load blur_x
      Ramp lanes 16
      Ramp stride 1
      Bit Size 32
      Vector Size 16
      Loop Depth 2
      Computation Cost 3%
      Loop Depth 2
      Data Movement Cost 5%
      [Dense, Vector] Load blur_x
      Ramp lanes 16
      Ramp stride 1
      Bit Size 32
      Vector Size 16
    • Loop Depth 0
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 23%
      Loop Depth 0
      Data Movement Cost (Exclusive) 0%
      Data Movement Cost (Inclusive) 40%
      else
      Loop Depth 0
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 23%
      Loop Depth 0
      Data Movement Cost (Exclusive) 0%
      Data Movement Cost (Inclusive) 40%
      Memory Type Auto
      Bit Size 16
      Vector Size 1
      Allocate blur_x
      Type | Dim-1 | Dim-2
      uint16 | 1920 | 4
      Loop Depth 0
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 22%
      Loop Depth 0
      Data Movement Cost (Exclusive) 0%
      Data Movement Cost (Inclusive) 40%
      For (blur_y.s0.y.yi.$n.rebased)
      Loop Span
      34
      Loop Depth 1
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 19%
      Loop Depth 1
      Data Movement Cost (Exclusive) 0%
      Data Movement Cost (Inclusive) 40%
      For (blur_y.s0.x.x)
      Loop Span
      120
      Loop Depth 2
      Computation Cost (Exclusive) 0%
      Computation Cost (Inclusive) 8%
      Loop Depth 2
      Data Movement Cost (Exclusive) 0%
      Data Movement Cost (Inclusive) 20%
      Produce blur_x
      Vector Size 16
      Bit Size 32
      Store blur_x
      Written | Read
      blur_x: 16 | input: 48
      Loop Depth 2
      Computation Cost 2%
      Loop Depth 2
      Data Movement Cost 5%
      [Dense, Vector] Load input
      Ramp lanes 16
      Ramp stride 1
      Bit Size 32
      Vector Size 16
      Parameter input
      Loop Depth 2
      Computation Cost 2%
      Loop Depth 2
      Data Movement Cost 5%
      [Dense, Vector] Load input
      Ramp lanes 16
      Ramp stride 1
      Bit Size 32
      Vector Size 16
      Parameter input
      Loop Depth 2
      Computation Cost 1%
      Loop Depth 2
      Data Movement Cost 5%
      [Dense, Vector] Load input
      Ramp lanes 16
      Ramp stride 1
      Bit Size 32
      Vector Size 16
      Parameter input
      • If
        • Loop Depth 2
          Computation Cost (Exclusive) 0%
          Computation Cost (Inclusive) 10%
          Loop Depth 2
          Data Movement Cost (Exclusive) 0%
          Data Movement Cost (Inclusive) 20%
          if (uint1)t66
          Loop Depth 2
          Computation Cost (Exclusive) 0%
          Computation Cost (Inclusive) 10%
          Loop Depth 2
          Data Movement Cost (Exclusive) 0%
          Data Movement Cost (Inclusive) 20%
          Consume blur_x
          Vector Size 16
          Bit Size 32
          Store blur_y
          Written | Read
          blur_y: 16 | blur_x: 48
          Loop Depth 2
          Computation Cost 3%
          Loop Depth 2
          Data Movement Cost 5%
          [Dense, Vector] Load blur_x
          Ramp lanes 16
          Ramp stride 1
          Bit Size 32
          Vector Size 16
          Loop Depth 2
          Computation Cost 3%
          Loop Depth 2
          Data Movement Cost 5%
          [Dense, Vector] Load blur_x
          Ramp lanes 16
          Ramp stride 1
          Bit Size 32
          Vector Size 16
          Loop Depth 2
          Computation Cost 3%
          Loop Depth 2
          Data Movement Cost 5%
          [Dense, Vector] Load blur_x
          Ramp lanes 16
          Ramp stride 1
          Bit Size 32
          Vector Size 16