From 1638cc24675301e53edae8b355abca48e6349ec9 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Thu, 26 Sep 2024 10:06:09 +0200 Subject: [PATCH] Make GPUInterpreter extensible Currently Enzyme uses its own AbstractInterpreter, in particular to block inlining of functions with custom rules and to handle nested autodiff operations. - [ ] Create a version of Enzyme with this - [ ] Support a version of `gpuc.deferred(meta)` --- src/driver.jl | 25 +++-- src/interface.jl | 18 ++- src/irgen.jl | 27 ++--- src/jlgen.jl | 230 ++++++++++++++++++++++++++++++++++----- test/bpf_testsetup.jl | 3 +- test/gcn_testsetup.jl | 5 +- test/metal_testsetup.jl | 5 +- test/native_tests.jl | 14 +-- test/native_testsetup.jl | 5 +- test/plugin_testsetup.jl | 43 ++++++++ test/ptx_tests.jl | 79 ++++++++++++++ test/ptx_testsetup.jl | 4 +- test/runtests.jl | 2 +- test/spirv_testsetup.jl | 5 +- 14 files changed, 389 insertions(+), 76 deletions(-) diff --git a/src/driver.jl b/src/driver.jl index 696a9ce1..5b64cfa2 100644 --- a/src/driver.jl +++ b/src/driver.jl @@ -42,7 +42,7 @@ end ## deferred compilation """ - var"gpuc.deferred"(f, args...)::Ptr{Cvoid} + var"gpuc.deferred"(meta, f, args...)::Ptr{Cvoid} As if we were to call `f(args...)` but instead we are putting down a marker and return a function pointer to later @@ -154,10 +154,11 @@ const __llvm_initialized = Ref(false) @timeit_debug to "IR generation" begin ir, compiled = irgen(job) + edge = Edge(inference_metadata(job), job.source) if job.config.entry_abi === :specfunc - entry_fn = compiled[job.source].specfunc + entry_fn = compiled[edge].specfunc else - entry_fn = compiled[job.source].func + entry_fn = compiled[edge].func end entry = functions(ir)[entry_fn] end @@ -198,24 +199,28 @@ const __llvm_initialized = Ref(false) return val end - worklist = Dict{Any, Vector{LLVM.CallInst}}() + worklist = Dict{Edge, Vector{LLVM.CallInst}}() for use in uses(dyn_marker) # decode the call call = user(use)::LLVM.CallInst - dyn_mi_inst = find_base_object(operands(call)[1]) + dyn_meta_inst = find_base_object(operands(call)[1]) + @compiler_assert isa(dyn_meta_inst, LLVM.ConstantInt) job + dyn_mi_inst = find_base_object(operands(call)[2]) @compiler_assert isa(dyn_mi_inst, LLVM.ConstantInt) job + dyn_meta = Base.unsafe_pointer_to_objref( + convert(Ptr{Cvoid}, convert(Int, dyn_meta_inst))) dyn_mi = Base.unsafe_pointer_to_objref( - convert(Ptr{Cvoid}, convert(Int, dyn_mi_inst))) - push!(get!(worklist, dyn_mi, LLVM.CallInst[]), call) + convert(Ptr{Cvoid}, convert(Int, dyn_mi_inst)))::MethodInstance + push!(get!(worklist, Edge(dyn_meta, dyn_mi), LLVM.CallInst[]), call) end - for dyn_mi in keys(worklist) - dyn_fn_name = compiled[dyn_mi].specfunc + for dyn_edge in keys(worklist) + dyn_fn_name = compiled[dyn_edge].specfunc dyn_fn = functions(ir)[dyn_fn_name] # insert a pointer to the function everywhere the entry is used T_ptr = convert(LLVMType, Ptr{Cvoid}) - for call in worklist[dyn_mi] + for call in worklist[dyn_edge] @dispose builder=IRBuilder() begin position!(builder, call) fptr = if LLVM.version() >= v"17" diff --git a/src/interface.jl b/src/interface.jl index 1346c858..a8af2d9c 100644 --- a/src/interface.jl +++ b/src/interface.jl @@ -89,6 +89,7 @@ Several keyword arguments can be used to customize the compilation process: struct CompilerConfig{T,P} target::T params::P + meta kernel::Bool name::Union{Nothing,String} @@ -98,6 +99,7 @@ struct CompilerConfig{T,P} function CompilerConfig(target::AbstractCompilerTarget, params::AbstractCompilerParams; + meta = nothing, 
kernel=true, name=nothing, entry_abi=:specfunc, @@ -106,16 +108,16 @@ struct CompilerConfig{T,P} if entry_abi ∉ (:specfunc, :func) error("Unknown entry_abi=$entry_abi") end - new{typeof(target), typeof(params)}(target, params, kernel, name, entry_abi, + new{typeof(target), typeof(params)}(target, params, meta, kernel, name, entry_abi, always_inline, opt_level) end end # copy constructor -CompilerConfig(cfg::CompilerConfig; target=cfg.target, params=cfg.params, +CompilerConfig(cfg::CompilerConfig; target=cfg.target, params=cfg.params, meta=cfg.meta, kernel=cfg.kernel, name=cfg.name, entry_abi=cfg.entry_abi, always_inline=cfg.always_inline, opt_level=cfg.opt_level) = - CompilerConfig(target, params; kernel, entry_abi, name, always_inline, opt_level) + CompilerConfig(target, params; meta, kernel, entry_abi, name, always_inline, opt_level) function Base.show(io::IO, @nospecialize(cfg::CompilerConfig{T})) where {T} print(io, "CompilerConfig for ", T) @@ -124,6 +126,7 @@ end function Base.hash(cfg::CompilerConfig, h::UInt) h = hash(cfg.target, h) h = hash(cfg.params, h) + h = hash(cfg.meta, h)::UInt h = hash(cfg.kernel, h) h = hash(cfg.name, h) @@ -178,15 +181,17 @@ runtime_module(@nospecialize(job::CompilerJob)) = error("Not implemented") # check if a function is an intrinsic that can be assumed to be always available isintrinsic(@nospecialize(job::CompilerJob), fn::String) = false +inference_metadata(@nospecialize(job::CompilerJob)) = job.config.meta # provide a specific interpreter to use. if VERSION >= v"1.11.0-DEV.1552" get_interpreter(@nospecialize(job::CompilerJob)) = - GPUInterpreter(job.world; method_table=method_table(job), + GPUInterpreter(job.world; meta=inference_metadata(job), method_table=method_table(job), token=ci_cache_token(job), inf_params=inference_params(job), opt_params=optimization_params(job)) else get_interpreter(@nospecialize(job::CompilerJob)) = - GPUInterpreter(job.world; method_table=method_table(job), + GPUInterpreter(job.world; meta=inference_metadata(job), method_table=method_table(job), code_cache=ci_cache(job), inf_params=inference_params(job), opt_params=optimization_params(job)) end @@ -227,10 +232,11 @@ struct GPUCompilerCacheToken target_type::Type always_inline::Bool method_table::Core.MethodTable + metadata end ci_cache_token(@nospecialize(job::CompilerJob)) = - GPUCompilerCacheToken(typeof(job.config.target), job.config.always_inline, method_table(job)) + GPUCompilerCacheToken(typeof(job.config.target), job.config.always_inline, method_table(job), inference_metadata(job)) # the codeinstance cache to use -- should only be used for the constructor if VERSION >= v"1.11.0-DEV.1552" diff --git a/src/irgen.jl b/src/irgen.jl index 874ed961..cdeceb3c 100644 --- a/src/irgen.jl +++ b/src/irgen.jl @@ -2,10 +2,11 @@ function irgen(@nospecialize(job::CompilerJob)) mod, compiled = @timeit_debug to "emission" compile_method_instance(job) + edge = Edge(inference_metadata(job), job.source) if job.config.entry_abi === :specfunc - entry_fn = compiled[job.source].specfunc + entry_fn = compiled[edge].specfunc else - entry_fn = compiled[job.source].func + entry_fn = compiled[edge].func end @assert entry_fn !== nothing entry = functions(mod)[entry_fn] @@ -70,25 +71,25 @@ function irgen(@nospecialize(job::CompilerJob)) entry = deprecation_marker end if job.config.entry_abi === :specfunc - func = compiled[job.source].func + func = compiled[edge].func specfunc = LLVM.name(entry) else func = LLVM.name(entry) - specfunc = 
compiled[edge].specfunc end - compiled[job.source] = - (; compiled[job.source].ci, func, specfunc) + compiled[edge] = + (; compiled[edge].ci, func, specfunc) # Earlier we sanitize global names; this invalidates the # func, specfunc names saved in compiled. Update the names now, # such that when we use the compiled mappings to look up the # llvm function for a methodinstance (deferred codegen) we have # valid targets. - for mi in keys(compiled) - mi == job.source && continue - ci, func, specfunc = compiled[mi] - compiled[mi] = (; ci, func=safe_name(func), specfunc=safe_name(specfunc)) + for other in keys(compiled) + other == edge && continue + ci, func, specfunc = compiled[other] + compiled[other] = (; ci, func=safe_name(func), specfunc=safe_name(specfunc)) end # TODO: Should we rewrite gpuc.lookup here? @@ -111,11 +112,11 @@ function irgen(@nospecialize(job::CompilerJob)) # internalize all functions, but keep exported global variables. linkage!(entry, LLVM.API.LLVMExternalLinkage) preserved_gvs = String[LLVM.name(entry)] - for mi in keys(compiled) + for other in keys(compiled) # delay internalizing of deferred calls since # gpuc.lookup is not yet rewritten. - mi == job.source && continue - _, _, specfunc = compiled[mi] + other == edge && continue + _, _, specfunc = compiled[other] push!(preserved_gvs, specfunc) # this could be deleted if we rewrite gpuc.lookup earlier end for gvar in globals(mod) diff --git a/src/jlgen.jl b/src/jlgen.jl index c6be8c94..213b45c4 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -320,6 +320,7 @@ else end struct GPUInterpreter <: CC.AbstractInterpreter + meta::Any world::UInt method_table::GPUMethodTableView @@ -336,6 +337,7 @@ end @static if HAS_INTEGRATED_CACHE function GPUInterpreter(world::UInt=Base.get_world_counter(); + meta = nothing, method_table::MTType, token::Any, inf_params::CC.InferenceParams, @@ -345,19 +347,20 @@ function GPUInterpreter(world::UInt=Base.get_world_counter(); method_table = get_method_table_view(world, method_table) inf_cache = Vector{CC.InferenceResult}() - return GPUInterpreter(world, method_table, + return GPUInterpreter(meta, world, method_table, token, inf_cache, inf_params, opt_params) end function GPUInterpreter(interp::GPUInterpreter; + meta=interp.meta, world::UInt=interp.world, method_table::GPUMethodTableView=interp.method_table, token::Any=interp.token, inf_cache::Vector{CC.InferenceResult}=interp.inf_cache, inf_params::CC.InferenceParams=interp.inf_params, opt_params::CC.OptimizationParams=interp.opt_params) - return GPUInterpreter(world, method_table, + return GPUInterpreter(meta, world, method_table, token, inf_cache, inf_params, opt_params) end @@ -365,6 +368,7 @@ end else function GPUInterpreter(world::UInt=Base.get_world_counter(); + meta=nothing, method_table::MTType, code_cache::CodeCache, inf_params::CC.InferenceParams, @@ -374,19 +378,20 @@ function GPUInterpreter(world::UInt=Base.get_world_counter(); method_table = get_method_table_view(world, method_table) inf_cache = Vector{CC.InferenceResult}() - return GPUInterpreter(world, method_table, + return GPUInterpreter(meta, world, method_table, code_cache, inf_cache, inf_params, opt_params) end function GPUInterpreter(interp::GPUInterpreter; + meta=interp.meta, world::UInt=interp.world, method_table::GPUMethodTableView=interp.method_table, code_cache::CodeCache=interp.code_cache, inf_cache::Vector{CC.InferenceResult}=interp.inf_cache, inf_params::CC.InferenceParams=interp.inf_params, opt_params::CC.OptimizationParams=interp.opt_params) - return 
GPUInterpreter(world, method_table, + return GPUInterpreter(meta, world, method_table, code_cache, inf_cache, inf_params, opt_params) end @@ -437,28 +442,76 @@ function CC.concrete_eval_eligible(interp::GPUInterpreter, end +within_gpucompiler() = false + ## deferred compilation struct DeferredCallInfo <: CC.CallInfo + meta::Any rt::DataType info::CC.CallInfo end # recognize calls to gpuc.deferred and save DeferredCallInfo metadata -function CC.abstract_call_known(interp::GPUInterpreter, @nospecialize(f), - arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState, - max_methods::Int = CC.get_max_methods(interp, f, sv)) +# default implementation, extensible through the meta argument. +# XXX: (or should we dispatch on `f`)? +function abstract_call_known(meta::Nothing, interp::GPUInterpreter, @nospecialize(f), + arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState, + max_methods::Int = CC.get_max_methods(interp, f, sv)) (; fargs, argtypes) = arginfo if f === var"gpuc.deferred" - argvec = argtypes[2:end] + argvec = argtypes[3:end] call = CC.abstract_call(interp, CC.ArgInfo(nothing, argvec), si, sv, max_methods) - callinfo = DeferredCallInfo(call.rt, call.info) + metaT = argtypes[2] + meta = CC.singleton_type(metaT) + if meta === nothing + if metaT isa Core.Const + meta = metaT.val + else + # meta is not a singleton type; the result may depend on runtime configuration + add_remark!(interp, sv, "Skipped gpuc.deferred since meta not constant") + @static if VERSION < v"1.11.0-" + return CC.CallMeta(Union{}, CC.Effects(), CC.NoCallInfo()) + else + return CC.CallMeta(Union{}, Union{}, CC.Effects(), CC.NoCallInfo()) + end + end + end + + callinfo = DeferredCallInfo(meta, call.rt, call.info) @static if VERSION < v"1.11.0-" return CC.CallMeta(Ptr{Cvoid}, CC.Effects(), callinfo) else return CC.CallMeta(Ptr{Cvoid}, Union{}, CC.Effects(), callinfo) end + elseif f === within_gpucompiler + if length(argtypes) != 1 + @static if VERSION < v"1.11.0-" + return CC.CallMeta(Union{}, CC.Effects(), CC.NoCallInfo()) + else + return CC.CallMeta(Union{}, Union{}, CC.Effects(), CC.NoCallInfo()) + end + end + @static if VERSION < v"1.11.0-" + return CC.CallMeta(Core.Const(true), CC.EFFECTS_TOTAL, CC.MethodResultPure()) + else + return CC.CallMeta(Core.Const(true), Union{}, CC.EFFECTS_TOTAL, CC.MethodResultPure()) + end end + return nothing +end + +function CC.abstract_call_known(interp::GPUInterpreter, @nospecialize(f), + arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState, + max_methods::Int = CC.get_max_methods(interp, f, sv)) + candidate = abstract_call_known(interp.meta, interp, f, arginfo, si, sv, max_methods) + if candidate === nothing && interp.meta !== nothing + candidate = abstract_call_known(nothing, interp, f, arginfo, si, sv, max_methods) + end + if candidate !== nothing + return candidate + end + return @invoke CC.abstract_call_known(interp::CC.AbstractInterpreter, f, arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState, max_methods::Int) @@ -485,23 +538,29 @@ function CC.handle_call!(todo::Vector{Pair{Int,Any}}, ir::CC.IRCode, idx::CC.Int args = Any[ "extern gpuc.lookup", Ptr{Cvoid}, - Core.svec(Any, Any, match.spec_types.parameters[2:end]...), # Must use Any for MethodInstance or ftype + Core.svec(Any, Any, Any, match.spec_types.parameters[2:end]...), # Must use Any for MethodInstance or ftype 0, QuoteNode(:llvmcall), + info.meta, case.invoke, - stmt.args[2:end]... + stmt.args[3:end]... 
] stmt.head = :foreigncall stmt.args = args return nothing end +struct Edge + meta::Any + mi::MethodInstance +end + struct DeferredEdges - edges::Vector{MethodInstance} + edges::Vector{Edge} end function find_deferred_edges(ir::CC.IRCode) - edges = MethodInstance[] + edges = Edge[] # XXX: can we add this instead in handle_call? for stmt in ir.stmts inst = stmt[:inst] @@ -509,8 +568,9 @@ function find_deferred_edges(ir::CC.IRCode) expr = inst::Expr if expr.head === :foreigncall && expr.args[1] == "extern gpuc.lookup" - deferred_mi = expr.args[6] - push!(edges, deferred_mi) + deferred_meta = expr.args[6] + deferred_mi = expr.args[7] + push!(edges, Edge(deferred_meta, deferred_mi)) end end unique!(edges) @@ -542,6 +602,116 @@ function CC.finish(interp::GPUInterpreter, opt::CC.OptimizationState, ir::CC.IRC end end +import .CC: CallInfo +struct NoInlineCallInfo <: CallInfo + info::CallInfo # wrapped call + tt::Any # ::Type + kind::Symbol + NoInlineCallInfo(@nospecialize(info::CallInfo), @nospecialize(tt), kind::Symbol) = + new(info, tt, kind) +end + +CC.nsplit_impl(info::NoInlineCallInfo) = CC.nsplit(info.info) +CC.getsplit_impl(info::NoInlineCallInfo, idx::Int) = CC.getsplit(info.info, idx) +CC.getresult_impl(info::NoInlineCallInfo, idx::Int) = CC.getresult(info.info, idx) +struct AlwaysInlineCallInfo <: CallInfo + info::CallInfo # wrapped call + tt::Any # ::Type + AlwaysInlineCallInfo(@nospecialize(info::CallInfo), @nospecialize(tt)) = new(info, tt) +end + +CC.nsplit_impl(info::AlwaysInlineCallInfo) = Core.Compiler.nsplit(info.info) +CC.getsplit_impl(info::AlwaysInlineCallInfo, idx::Int) = CC.getsplit(info.info, idx) +CC.getresult_impl(info::AlwaysInlineCallInfo, idx::Int) = CC.getresult(info.info, idx) + + +function inlining_handler(meta::Nothing, interp::GPUInterpreter, @nospecialize(atype), callinfo) + return nothing +end + +using Core.Compiler: ArgInfo, StmtInfo, AbsIntState +function CC.abstract_call_gf_by_type(interp::GPUInterpreter, @nospecialize(f), arginfo::ArgInfo, + si::StmtInfo, @nospecialize(atype), sv::AbsIntState, max_methods::Int) + ret = @invoke CC.abstract_call_gf_by_type(interp::CC.AbstractInterpreter, f::Any, arginfo::ArgInfo, + si::StmtInfo, atype::Any, sv::AbsIntState, max_methods::Int) + + callinfo = nothing + if interp.meta !== nothing + callinfo = inlining_handler(interp.meta, interp, atype, ret.info) + end + if callinfo === nothing + callinfo = inlining_handler(nothing, interp, atype, ret.info) + end + if callinfo === nothing + callinfo = ret.info + end + + @static if VERSION ≥ v"1.11-" + return CC.CallMeta(ret.rt, ret.exct, ret.effects, callinfo) + else + return CC.CallMeta(ret.rt, ret.effects, callinfo) + end +end + +@static if VERSION < v"1.12.0-DEV.45" +let # overload `inlining_policy` + @static if VERSION ≥ v"1.11.0-DEV.879" + sigs_ex = :( + interp::GPUInterpreter, + @nospecialize(src), + @nospecialize(info::CC.CallInfo), + stmt_flag::UInt32, + ) + args_ex = :( + interp::CC.AbstractInterpreter, + src::Any, + info::CC.CallInfo, + stmt_flag::UInt32, + ) + else + sigs_ex = :( + interp::GPUInterpreter, + @nospecialize(src), + @nospecialize(info::CC.CallInfo), + stmt_flag::UInt8, + mi::MethodInstance, + argtypes::Vector{Any}, + ) + args_ex = :( + interp::CC.AbstractInterpreter, + src::Any, + info::CC.CallInfo, + stmt_flag::UInt8, + mi::MethodInstance, + argtypes::Vector{Any}, + ) + end + @eval function CC.inlining_policy($(sigs_ex.args...)) + if info isa NoInlineCallInfo + @safe_debug "Blocking inlining" info.tt info.kind + return nothing + elseif info isa 
AlwaysInlineCallInfo + @safe_debug "Forcing inlining for" info.tt + return src + end + return @invoke CC.inlining_policy($(args_ex.args...)) + end +end +else +function CC.src_inlining_policy(interp::GPUInterpreter, + @nospecialize(src), @nospecialize(info::CC.CallInfo), stmt_flag::UInt32) + + if info isa NoInlineCallInfo + @safe_debug "Blocking inlining" info.tt info.kind + return false + elseif info isa AlwaysInlineCallInfo + @safe_debug "Forcing inlining for" info.tt + return true + end + return @invoke CC.src_inlining_policy(interp::CC.AbstractInterpreter, src, info::CC.CallInfo, stmt_flag::UInt32) +end +end + ## world view of the cache using Core.Compiler: WorldView @@ -697,14 +867,16 @@ function compile_method_instance(@nospecialize(job::CompilerJob)) # generate for the same mi multiple LLVM functions. # `outstanding` are the missing edges that were not compiled by `compile_method_instance` # Currently these edges are generated through deferred codegen. - compiled = IdDict() + compiled = IdDict{Edge, Any}() llvm_mod, outstanding = compile_method_instance(job, compiled) worklist = outstanding while !isempty(worklist) - source = pop!(worklist) - haskey(compiled, source) && continue # We have fulfilled the request already + edge = pop!(worklist) + haskey(compiled, edge) && continue # We have fulfilled the request already + source = edge.mi + meta = edge.meta # Create a new compiler job for this edge, reusing the config settings from the initial one - job2 = CompilerJob(source, job.config) + job2 = CompilerJob(source, CompilerConfig(job.config; meta)) llvm_mod2, outstanding = compile_method_instance(job2, compiled) append!(worklist, outstanding) # merge worklist with new outstanding edges @assert context(llvm_mod) == context(llvm_mod2) @@ -714,7 +886,7 @@ function compile_method_instance(@nospecialize(job::CompilerJob)) return llvm_mod, compiled end -function compile_method_instance(@nospecialize(job::CompilerJob), compiled::IdDict{Any, Any}) +function compile_method_instance(@nospecialize(job::CompilerJob), compiled::IdDict{Edge, Any}) # populate the cache interp = get_interpreter(job) cache = CC.code_cache(interp) @@ -790,6 +962,7 @@ function compile_method_instance(@nospecialize(job::CompilerJob), compiled::IdDi end # process all compiled method instances + meta = inference_metadata(job) for mi in method_instances ci = ci_cache_lookup(cache, mi, job.world, job.world) ci === nothing && continue @@ -825,14 +998,15 @@ function compile_method_instance(@nospecialize(job::CompilerJob), compiled::IdDi # removed or renamed during optimization, so we store their name instead. # FIXME: Enable this assert when we have a fully featured worklist # @assert !haskey(compiled, mi) - compiled[mi] = (; ci, func=llvm_func, specfunc=llvm_specfunc) + compiled[Edge(meta, mi)] = (; ci, func=llvm_func, specfunc=llvm_specfunc) end # Collect the deferred edges - outstanding = Any[] + outstanding = Edge[] for mi in method_instances - !haskey(compiled, mi) && continue # Equivalent to ci_cache_lookup == nothing - ci = compiled[mi].ci + edge = Edge(meta, mi) + !haskey(compiled, edge) && continue # Equivalent to ci_cache_lookup == nothing + ci = compiled[edge].ci @static if VERSION >= v"1.11.0-" edges = CC.traverse_analysis_results(ci) do @nospecialize result return result isa DeferredEdges ? 
result : return @@ -844,16 +1018,16 @@ function compile_method_instance(@nospecialize(job::CompilerJob), compiled::IdDi end end if edges !== nothing - for deferred_mi in (edges::DeferredEdges).edges - if !haskey(compiled, deferred_mi) - push!(outstanding, deferred_mi) + for other in (edges::DeferredEdges).edges + if !haskey(compiled, other) + push!(outstanding, other) end end end end # ensure that the requested method instance was compiled - @assert haskey(compiled, job.source) + @assert haskey(compiled, Edge(meta, job.source)) return llvm_mod, outstanding end diff --git a/test/bpf_testsetup.jl b/test/bpf_testsetup.jl index 0570a6e0..b7fc7dfc 100644 --- a/test/bpf_testsetup.jl +++ b/test/bpf_testsetup.jl @@ -10,7 +10,8 @@ struct CompilerParams <: AbstractCompilerParams end GPUCompiler.runtime_module(::CompilerJob{<:Any,CompilerParams}) = TestRuntime function create_job(@nospecialize(func), @nospecialize(types); - kernel::Bool=false, always_inline=false, kwargs...) + kernel::Bool=false, always_inline=false, + meta=nothing, kwargs...) source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter()) target = BPFCompilerTarget() params = CompilerParams() diff --git a/test/gcn_testsetup.jl b/test/gcn_testsetup.jl index 846db4b6..d7a6b88d 100644 --- a/test/gcn_testsetup.jl +++ b/test/gcn_testsetup.jl @@ -10,11 +10,12 @@ struct CompilerParams <: AbstractCompilerParams end GPUCompiler.runtime_module(::CompilerJob{<:Any,CompilerParams}) = TestRuntime function create_job(@nospecialize(func), @nospecialize(types); - kernel::Bool=false, always_inline=false, kwargs...) + kernel::Bool=false, always_inline=false, + meta=nothing, kwargs...) source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter()) target = GCNCompilerTarget(dev_isa="gfx900") params = CompilerParams() - config = CompilerConfig(target, params; kernel, always_inline) + config = CompilerConfig(target, params; kernel, always_inline, meta) CompilerJob(source, config), kwargs end diff --git a/test/metal_testsetup.jl b/test/metal_testsetup.jl index 0055cb18..2fbd956f 100644 --- a/test/metal_testsetup.jl +++ b/test/metal_testsetup.jl @@ -10,11 +10,12 @@ struct CompilerParams <: AbstractCompilerParams end GPUCompiler.runtime_module(::CompilerJob{<:Any,CompilerParams}) = TestRuntime function create_job(@nospecialize(func), @nospecialize(types); - kernel::Bool=false, always_inline=false, kwargs...) + kernel::Bool=false, always_inline=false, + meta=nothing, kwargs...) 
source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter()) target = MetalCompilerTarget(; macos=v"12.2", metal=v"3.0", air=v"3.0") params = CompilerParams() - config = CompilerConfig(target, params; kernel, always_inline) + config = CompilerConfig(target, params; kernel, always_inline, meta) CompilerJob(source, config), kwargs end diff --git a/test/native_tests.jl b/test/native_tests.jl index cd4a20c0..d07e930c 100644 --- a/test/native_tests.jl +++ b/test/native_tests.jl @@ -43,12 +43,12 @@ end meth = only(methods(outer, (Int,))) - mis = filter(mi->mi.def == meth, keys(meta.compiled)) + mis = filter(edge->edge.mi.def == meth, keys(meta.compiled)) @test length(mis) == 1 - other_mis = filter(mi->mi.def != meth, keys(meta.compiled)) + other_mis = filter(edge->edge.mi.def != meth, keys(meta.compiled)) @test length(other_mis) == 1 - @test only(other_mis).def in methods(inner) + @test only(other_mis).mi.def in methods(inner) end end @@ -63,11 +63,11 @@ end meth = only(methods(foo, (Float64,))) - mis = filter(mi->mi.def == meth, keys(meta.compiled)) + mis = filter(edge->edge.mi.def == meth, keys(meta.compiled)) @test length(mis) == 1 - inner_methods = filter(keys(meta.compiled)) do mi - mi.def in methods(inner) && mi.specTypes == Tuple{typeof(inner), Float64} + inner_methods = filter(keys(meta.compiled)) do edge + edge.mi.def in methods(inner) && edge.mi.specTypes == Tuple{typeof(inner), Float64} end @test length(inner_methods) == 1 end @@ -166,7 +166,7 @@ end @testset "deferred" begin @gensym child kernel unrelated @eval @noinline $child(i) = i - @eval $kernel(i) = GPUCompiler.var"gpuc.deferred"($child, i) + @eval $kernel(i) = GPUCompiler.var"gpuc.deferred"(nothing, $child, i) # smoke test job, _ = Native.create_job(eval(kernel), (Int64,)) diff --git a/test/native_testsetup.jl b/test/native_testsetup.jl index 3406276c..2f674470 100644 --- a/test/native_testsetup.jl +++ b/test/native_testsetup.jl @@ -26,11 +26,12 @@ GPUCompiler.can_safepoint(@nospecialize(job::NativeCompilerJob)) = job.config.pa function create_job(@nospecialize(func), @nospecialize(types); kernel::Bool=false, entry_abi=:specfunc, entry_safepoint::Bool=false, always_inline=false, - method_table=test_method_table, kwargs...) + method_table=test_method_table, + meta=nothing, kwargs...) source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter()) target = NativeCompilerTarget() params = CompilerParams(entry_safepoint, method_table) - config = CompilerConfig(target, params; kernel, entry_abi, always_inline) + config = CompilerConfig(target, params; kernel, entry_abi, always_inline, meta) CompilerJob(source, config), kwargs end diff --git a/test/plugin_testsetup.jl b/test/plugin_testsetup.jl index 90672e3d..df01c295 100644 --- a/test/plugin_testsetup.jl +++ b/test/plugin_testsetup.jl @@ -28,4 +28,47 @@ end GPUCompiler.register_plugin!("gpucompiler.mark", false, pipeline_callback=remove_mark!) 
+current_inlinestate() = nothing + +abstract type InlineStateMeta end +struct AlwaysInlineMeta <: InlineStateMeta end +struct NeverInlineMeta <: InlineStateMeta end + +import GPUCompiler: abstract_call_known, GPUInterpreter +import Core.Compiler: CallMeta, Effects, NoCallInfo, ArgInfo, + StmtInfo, AbsIntState, EFFECTS_TOTAL, + MethodResultPure + +function abstract_call_known(meta::InlineStateMeta, interp::GPUInterpreter, @nospecialize(f), + arginfo::ArgInfo, si::StmtInfo, sv::AbsIntState, max_methods::Int) + (; fargs, argtypes) = arginfo + + if f === current_inlinestate + if length(argtypes) != 1 + @static if VERSION < v"1.11.0-" + return CallMeta(Union{}, Effects(), NoCallInfo()) + else + return CallMeta(Union{}, Union{}, Effects(), NoCallInfo()) + end + end + @static if VERSION < v"1.11.0-" + return CallMeta(Core.Const(meta), EFFECTS_TOTAL, MethodResultPure()) + else + return CallMeta(Core.Const(meta), Union{}, EFFECTS_TOTAL, MethodResultPure()) + end + end + return nothing +end + +import GPUCompiler: inlining_handler, NoInlineCallInfo, AlwaysInlineCallInfo +function inlining_handler(meta::InlineStateMeta, interp::GPUInterpreter, @nospecialize(atype), callinfo) + if meta isa NeverInlineMeta + return NoInlineCallInfo(callinfo, atype, :default) + elseif meta isa AlwaysInlineMeta + return AlwaysInlineCallInfo(callinfo, atype) + end + return nothing +end + + end \ No newline at end of file diff --git a/test/ptx_tests.jl b/test/ptx_tests.jl index 600561f5..6476afde 100644 --- a/test/ptx_tests.jl +++ b/test/ptx_tests.jl @@ -277,6 +277,19 @@ end @test "We did not crash!" != "" end +@testset "within_gpucompiler" begin + function kernel(a) + unsafe_store!(a, GPUCompiler.within_gpucompiler()) + end + ir = sprint(io->InteractiveUtils.code_llvm(io, kernel, Tuple{Ptr{Bool}})) + @test occursin("store i8 0,", ir) + @test !occursin("store i8 1,", ir) + + ir = sprint(io->PTX.code_llvm(io, kernel, Tuple{Ptr{Bool}})) + @test !occursin("store i8 0,", ir) + @test occursin("store i8 1,", ir) +end + @testset "exception arguments" begin function kernel(a) unsafe_store!(a, trunc(Int, unsafe_load(a))) @@ -425,4 +438,70 @@ import InteractiveUtils ir = sprint(io->PTX.code_llvm(io, kernel, Tuple{Int})) @test !occursin("gpucompiler.mark", ir) end + +@testset "current_inlinestate" begin + function kernel(a) + state = Plugin.current_inlinestate() + if state === nothing + unsafe_store!(a, 0) + elseif state === Plugin.NeverInlineMeta() + unsafe_store!(a, 1) + elseif state === Plugin.AlwaysInlineMeta() + unsafe_store!(a, 2) + end + return nothing + end + ir = sprint(io->InteractiveUtils.code_llvm(io, kernel, Tuple{Ptr{Int64}})) + @test occursin("store i64 0,", ir) + @test !occursin("store i64 1,", ir) + @test !occursin("store i64 2,", ir) + + ir = sprint(io->PTX.code_llvm(io, kernel, Tuple{Ptr{Int64}})) + @test occursin("store i64 0,", ir) + @test !occursin("store i64 1,", ir) + @test !occursin("store i64 2,", ir) + + ir = sprint(io->PTX.code_llvm(io, kernel, Tuple{Ptr{Int64}}, meta=Plugin.NeverInlineMeta())) + @test !occursin("store i64 0,", ir) + @test occursin("call fastcc void @julia_unsafe_store", ir) # call fastcc void @julia_unsafe_store__1397(i64 zeroext %0, i64 signext 1) + @test !occursin("store i64 2,", ir) + + ir = sprint(io->PTX.code_llvm(io, kernel, Tuple{Ptr{Int64}}, meta=Plugin.AlwaysInlineMeta())) + @test !occursin("store i64 0,", ir) + @test !occursin("store i64 1,", ir) + @test occursin("store i64 2,", ir) +end + +@testset "InlineStateMeta" begin + + @noinline function noinline(x) + x^2 + end + + 
@inline function inline(x) + x^2 + end + + function kernel_noinline(a, x) + unsafe_store!(a, noinline(x)) + nothing + end + + function kernel_inline(a, x) + unsafe_store!(a, inline(x)) + nothing + end + + ir = sprint(io->PTX.code_llvm(io, kernel_noinline, Tuple{Ptr{Int64}, Int64})) + @test occursin("call fastcc i64 @julia_noinline", ir) + + ir = sprint(io->PTX.code_llvm(io, kernel_noinline, Tuple{Ptr{Int64}, Int64}, meta=Plugin.AlwaysInlineMeta())) + @test !occursin("call fastcc i64 @julia_noinline", ir) + + ir = sprint(io->PTX.code_llvm(io, kernel_inline, Tuple{Ptr{Int64}, Int64})) + @test !occursin("call fastcc i64 @julia_inline", ir) + + ir = sprint(io->PTX.code_llvm(io, kernel_inline, Tuple{Ptr{Int64}, Int64}, meta=Plugin.NeverInlineMeta())) + @test occursin("call fastcc i64 @julia_inline", ir) +end end #testitem diff --git a/test/ptx_testsetup.jl b/test/ptx_testsetup.jl index 89516283..33505756 100644 --- a/test/ptx_testsetup.jl +++ b/test/ptx_testsetup.jl @@ -39,13 +39,13 @@ GPUCompiler.runtime_module(::PTXCompilerJob) = PTXTestRuntime function create_job(@nospecialize(func), @nospecialize(types); kernel::Bool=false, minthreads=nothing, maxthreads=nothing, blocks_per_sm=nothing, - maxregs=nothing, always_inline=false, kwargs...) + maxregs=nothing, always_inline=false, meta=nothing, kwargs...) source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter()) target = PTXCompilerTarget(;cap=v"7.0", minthreads, maxthreads, blocks_per_sm, maxregs) params = CompilerParams() - config = CompilerConfig(target, params; kernel, always_inline) + config = CompilerConfig(target, params; kernel, always_inline, meta) CompilerJob(source, config), kwargs end diff --git a/test/runtests.jl b/test/runtests.jl index 199e641f..66c0fff6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -22,7 +22,7 @@ runtests(GPUCompiler; nworkers=min(Sys.CPU_THREADS,4), nworker_threads=1, end end - if ti.name in ["PTX", "GCN", "PTX precompile"] && Sys.isapple() + if ti.name in ["PTX", "GCN", "PTX precompile", "PTX plugin"] && Sys.isapple() # support for AMDGPU and NVPTX on macOS has been removed from Julia's LLVM build return false end diff --git a/test/spirv_testsetup.jl b/test/spirv_testsetup.jl index f1221545..f1fd9325 100644 --- a/test/spirv_testsetup.jl +++ b/test/spirv_testsetup.jl @@ -11,11 +11,12 @@ GPUCompiler.runtime_module(::CompilerJob{<:Any,CompilerParams}) = TestRuntime function create_job(@nospecialize(func), @nospecialize(types); kernel::Bool=false, always_inline=false, - supports_fp16=true, supports_fp64=true, kwargs...) + supports_fp16=true, supports_fp64=true, + meta=nothing, kwargs...) source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter()) target = SPIRVCompilerTarget(; supports_fp16, supports_fp64) params = CompilerParams() - config = CompilerConfig(target, params; kernel, always_inline) + config = CompilerConfig(target, params; kernel, always_inline, meta) CompilerJob(source, config), kwargs end
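For reference, a minimal sketch of the extension pattern this patch enables, mirroring the `plugin_testsetup.jl` and `*_testsetup.jl` additions above. The `MyMeta` type and the `square` function are hypothetical stand-ins, and `Native` refers to a test-setup module like the ones in this patch; none of these names are introduced by the patch itself:

    using GPUCompiler
    import GPUCompiler: GPUInterpreter, inlining_handler, NoInlineCallInfo

    # Plugin-defined metadata: a singleton carried in CompilerConfig.meta,
    # the GPUInterpreter, the cache token, and every deferred-compilation Edge.
    struct MyMeta end

    square(x) = x * x

    # Hook inference by dispatching on the meta type; returning `nothing`
    # falls back to the default (meta::Nothing) behavior. Here we block all
    # inlining while compiling under MyMeta, tagged with a custom kind.
    function inlining_handler(meta::MyMeta, interp::GPUInterpreter,
                              @nospecialize(atype), callinfo)
        return NoInlineCallInfo(callinfo, atype, :mymeta)
    end

    # Thread the metadata through a job; the testsetup create_job helpers
    # now accept a meta keyword and forward it to CompilerConfig.
    job, _ = Native.create_job(square, (Int,); meta=MyMeta())

    # gpuc.deferred now takes the metadata as its first argument (the tests
    # pass `nothing` when no plugin metadata is needed):
    kernel(i) = GPUCompiler.var"gpuc.deferred"(MyMeta(), square, i)

Because `meta` participates in `Base.hash(::CompilerConfig)` and in `GPUCompilerCacheToken`, code compiled under different metadata is inferred and cached separately.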