|
FreeTensor
|
#include <schedule.h>
Public Types | |
| typedef std::unordered_map< ID, ID > | IDMap |
Public Member Functions | |
| Schedule ()=default | |
| Schedule (const Stmt &ast, int verbose=0) | |
| Schedule (const Func &func, int verbose=0) | |
| Schedule (const Schedule &)=default | |
| Schedule & | operator= (const Schedule &)=default |
| Schedule | fork () const |
| Func | func () const |
| const Stmt & | ast () const |
| const ScheduleLog & | logs () const |
| int | verbose () const |
| template<class T > | |
| std::vector< Stmt > | findAll (const T &filter) const |
| template<class T > | |
| std::vector< Stmt > | findAtLeastOne (const T &filter) const |
| template<class T > | |
| Stmt | find (const T &filter) const |
| std::pair< ID, ID > | split (const ID &id, int factor=-1, int nparts=-1, int shift=0) |
| void | reorder (const std::vector< ID > &order, ReorderMode mode=ReorderMode::PerfectOnly) |
| ID | merge (const ID &loop1, const ID &loop2) |
| std::vector< ID > | permute (const std::vector< ID > &loopsId, const std::function< std::vector< Expr >(std::vector< Expr >)> &transformFunc) |
| std::pair< IDMap, IDMap > | fission (const ID &loop, FissionSide side, const ID &splitter, bool allowEnlarge=true, const std::string &suffix0=".0", const std::string &suffix1=".1") |
| void | swap (const std::vector< ID > &order) |
| void | blend (const ID &loop) |
| std::tuple< ID, ID, std::string, ID > | cache (const ID &stmt, const std::string &var, MemType mtype) |
| std::tuple< ID, ID, std::string, ID > | cacheReduction (const ID &stmt, const std::string &var, MemType mtype) |
| void | varSplit (const ID &def, int dim, VarSplitMode mode, int factor=-1, int nparts=-1) |
| void | varMerge (const ID &def, int dim) |
| void | varReorder (const ID &def, const std::vector< int > &order) |
| void | varUnsqueeze (const ID &def, int dim) |
| void | varSqueeze (const ID &def, int dim) |
| std::pair< ID, ID > | moveTo (const ID &stmt, MoveToSide side, const ID &dst) |
| void | inlining (const ID &def) |
| void | parallelize (const ID &loop, const ParallelScope &parallel, bool allowReduction=true) |
| void | parallelizeAs (const ID &nest, const ID &reference, const ID &defId) |
| void | unroll (const ID &loop, bool immediate=false) |
| void | vectorize (const ID &loop) |
| void | separateTail (bool noDuplicateVarDefs=false) |
| void | asMatMul (const ID &loop, AsMatMulMode mode, const Ref< Target > &target, MatMulBackend backend) |
| void | asMatMul (const ID &loop, AsMatMulMode mode, const Ref< Target > &target) |
| void | asMatMul (const ID &loop, AsMatMulMode mode=AsMatMulMode::KeepMemLayout) |
| std::pair< ID, int > | plutoFuse (const ID &loop0, const ID &loop1, int nestLevel0=0, int nestLevel1=0, int fusableOverlapThreshold=1, int fusableNonOverlapTolerance=4, bool doSimplify=true) |
| std::pair< ID, int > | plutoPermute (const ID &loop, int nestLevel=0, bool doSimplify=true) |
| void | autoSchedule (const Ref< Target > &target, const Ref< RandTrace > &trace=nullptr) |
| void | autoInline (const Ref< Target > &target) |
| void | autoUseLib (const Ref< Target > &target) |
| void | autoReorder (const Ref< Target > &target) |
| void | autoSwap (const Ref< Target > &target) |
| void | autoPluto (const Ref< Target > &target) |
| void | autoFissionFuse (const Ref< Target > &target, const Ref< RandTrace > &trace=nullptr) |
| void | autoMemLayout (const Ref< Target > &target) |
| void | autoParallelize (const Ref< Target > &target) |
| void | autoSetMemType (const Ref< Target > &target) |
| void | autoUnroll (const Ref< Target > &target) |
| std::vector< AutoScheduleTuneTrial > | tuneAutoSchedule (int nBatch, int batchSize, const Ref< Device > &device, const std::vector< Ref< Array > > &args, const std::unordered_map< std::string, Ref< Array > > &kws={}, const std::regex &toLearn=std::regex{".*"}) |
| void | beginTransaction () |
| void | commitTransaction () |
| void | abortTransaction () |
| ID | fuse (const ID &loop0, const ID &loop1, bool strict=false) |
| ID | fuse (const ID &loop0, bool strict=false) |
| void | setMemType (const ID &def, MemType mtype) |
| void | setMemType (const ID &def, MemType mtype, bool rejectIndirectAccess) |
| typedef std::unordered_map<ID, ID> freetensor::Schedule::IDMap |
|
default |
| freetensor::Schedule::Schedule | ( | const Stmt & | ast, |
| int | verbose = 0 |
||
| ) |
|
inline |
|
default |
| void freetensor::Schedule::abortTransaction | ( | ) |
| void freetensor::Schedule::asMatMul | ( | const ID & | loop, |
| AsMatMulMode | mode, | ||
| const Ref< Target > & | target | ||
| ) |
| void freetensor::Schedule::asMatMul | ( | const ID & | loop, |
| AsMatMulMode | mode, | ||
| const Ref< Target > & | target, | ||
| MatMulBackend | backend | ||
| ) |
Transform nested loops to be an external call to a matrix multiplication
| loop | ID of the loop |
| mode | : What to do if the memory layout does not meet the requirement from the external library. KeepMemLayout => Raise an exception. TryVarReorder => try var_reorder on some variables, but may affect performance of other uses of these variables. TryTranspose => try cache and then var_reorder on some variables, but will incur extra overhead. |
| target | : Hardware target. If omitted, use the default target in Config, or the target set by with scopes. |
| backend | : Backend library. Defaults to Mkl for CPU targets, Cublas for GPU targets. |
| InvalidSchedule | if the loop cannot be transformed to be a matrix multiplication |
| void freetensor::Schedule::asMatMul | ( | const ID & | loop, |
| AsMatMulMode | mode = AsMatMulMode::KeepMemLayout |
||
| ) |
| const Stmt & freetensor::Schedule::ast | ( | ) | const |
| void freetensor::Schedule::autoFissionFuse | ( | const Ref< Target > & | target, |
| const Ref< RandTrace > & | trace = nullptr |
||
| ) |
(Experimental) Automatically fuse consecutive loops or vice versa using some heuristics
| target | : Target architecture |
| trace | : Random decision trace |
(Experimental) Automatically inline very-small VarDef nodes
| target | : Target architecture |
(Experimental) Automatically adjust memory layout of variables
| target | : Target architecture |
(Experimental) Automatically parallelize some loops using some heuristics
| target | : Target architecture |
IV. Recurse into sub-loops if failed
(Experimental) Automatically apply pluto-based schedules
| target | : Target architecture |
(Experimental) Automatically reorder loops in a loop nest
| target | : Target architecture |
| void freetensor::Schedule::autoSchedule | ( | const Ref< Target > & | target, |
| const Ref< RandTrace > & | trace = nullptr |
||
| ) |
(Experimental) Automatic scheduling using some heuristics
| target | : Target architecture |
| trace | : Random decision trace |
(Experimental) Automatically set memory types using some heuristics
| target | : Target architecture |
(Experimental) Automatically swap statements to enable more fission or fusion
| target | : Target architecture |
(Experimental) Automatically unroll loops using some heuristics
| target | : Target architecture |
(Experimental) Automatically use external libs using some heuristics
| target | : Target architecture |
| void freetensor::Schedule::beginTransaction | ( | ) |
Transaction of schedules
Schedules are applied in transactions. A transaction is created with beginTransaction(), applied as a whole with commitTransaction(), and can be aborted with abortTransaction()
Transactions can be nested. Technically, each schedule is by itself an inner-most transaction, while a Schedule object defines the outer-most transaction, but these inner-most and outer-most transactions are invisible to users
| void freetensor::Schedule::blend | ( | const ID & | loop | ) |
Unroll a loop and interleave statements from each iteration
E.g.
will be transformed to be
Virtual threads in TVM can be implemented via blend
| loop | : ID of the loop being transformed |
| InvalidSchedule | if the loop is not found, the loop length is not a constant, or the dependences cannot be solved |
| std::tuple< ID, ID, std::string, ID > freetensor::Schedule::cache | ( | const ID & | stmt, |
| const std::string & | var, | ||
| MemType | mtype | ||
| ) |
Cache a variable into a new local variable
All needed data will be filled into the cache first, then all reads and writes will be directed to the cache, and finally all needed data will be flushed from the cache
Note for reduction: This transformation preserves the computation order. It will transform
to
If you need a "real" cache for reduction, which reorders the computation, use cache_reduction instead
| stmt | : ID of the statement or block (e.g. an If or a For) to be modified |
| var | : name of the variable to be cached |
| mtype | : where to cache |
| InvalidSchedule | if the ID or name is not found |
| std::tuple< ID, ID, std::string, ID > freetensor::Schedule::cacheReduction | ( | const ID & | stmt, |
| const std::string & | var, | ||
| MemType | mtype | ||
| ) |
Perform local reductions (e.g. sum) in a local variable first, and then reduce the local result to the global variable
E.g.
will be transformed to be
| stmt | : ID of the statement or block (e.g. an If or a For) to be modified |
| var | : name of the variable to be cached. Only reductions are allowed on var in stmt. Plain reads or writes are not allowed |
| mtype | : where to cache |
| InvalidSchedule | if the ID or name is not found, or there are unsupported reads or writes |
| void freetensor::Schedule::commitTransaction | ( | ) |
|
inline |
Find the only node in the current AST satisfying a given condition
| InvalidSchedule | : if there is more than one, or there is no node found |
|
inline |
|
inline |
| std::pair< Schedule::IDMap, Schedule::IDMap > freetensor::Schedule::fission | ( | const ID & | loop, |
| FissionSide | side, | ||
| const ID & | splitter, | ||
| bool | allowEnlarge = true, |
||
| const std::string & | suffix0 = ".0", |
||
| const std::string & | suffix1 = ".1" |
||
| ) |
Fission a loop into two loops each containing part of the statements, one followed by another
To split loop into two nested loops, use split instead
Statements inside the original loop will be distributed to one or both (happening if they are scope statements) loops. If a statement is originally labeled "S", it can be selected by "$fission.0{S}" (from the first loop) or "$fission.1{S}" (from the second loop) after fission. If one of the resulting loops has an empty body, it will be removed
| loop | : ID of the loop to be fissioned |
| side | : If After, splitter is the last statement of the first loop. If Before, splitter is the first statement of the second loop |
| splitter | : Where to fission the loop |
| allowEnlarge | : If true, try to avoid dependence by enlarging some VarDef nodes. If false, throw InvalidSchedule in such cases. |
| suffix0 | : The suffix in the op of metadata of result part 0. If empty, the fissioned part 0 preserves original ID and metadata. Cannot be empty together with suffix1. |
| suffix1 | : The suffix in the op of metadata of result part 1. If empty, the fissioned part 1 preserves original ID and metadata. Cannot be empty together with suffix0. |
| InvalidSchedule | if any dependence cannot be resolved |
|
inline |
|
inline |
Fuse two directly following loops with the same length into one
To merge nested loops into one, use merge instead
parallelize, unroll and vectorize properties will be reset on the fused loop
Suppose the original loops are labeled "L1" and "L2", the fused loop can be selected by "$fuse{L1, L2}"
| loop0 | : ID of the leading loop |
| loop1 | : ID of the following loop. If omitted, it will try to find a following loop of loop0 |
| strict | : If true, throw an error if unable to determine whether the two loops are of the same length |
| InvalidSchedule | if the two loops are not directly following, the two loops are not of the same length, or there is any dependence that cannot be resolved |
| void freetensor::Schedule::inlining | ( | const ID & | def | ) |
Remove a variable. When the variable is used, recompute its value
| def | : ID of the VarDef statement of the specific variable. It can not be an I/O variable |
| InvalidSchedule | if the variable cannot be completely removed |
| const ScheduleLog & freetensor::Schedule::logs | ( | ) | const |
Merge two directly nested loops into one
To fuse consecutive loops, use fuse instead
parallelize, unroll and vectorize properties will be reset on the merged loop
Suppose the original loops are labeled "L1" and "L2", the merged loop can be selected by "$merge{L1, L2}"
| loop1,loop2 | : ID of the loops to be merged, can be in any order |
| InvalidSchedule | if the loops are not directly nested |
| std::pair< ID, ID > freetensor::Schedule::moveTo | ( | const ID & | stmt, |
| MoveToSide | side, | ||
| const ID & | dst | ||
| ) |
Move a statement to a new position
This is a composite schedule command, which is implemented with other commands
If moving a statement out of some loops, identical loops will be added around the moved statement, which is equivalent to fission these loops
| stmt | : ID of the statement to be moved |
| side | : Whether stmt will be BEFORE or AFTER dst |
| dst | : Insert stmt to be directly before or after this statement, as selected by side |
| InvalidSchedule | if there is no feasible path to move |
| void freetensor::Schedule::parallelize | ( | const ID & | loop, |
| const ParallelScope & | parallel, | ||
| bool | allowReduction = true |
||
| ) |
Mark a loop with a parallel implementation
This schedule follows a fork-join model: multiple workers (abstract threads) are created (but physically the threads may be cached in a thread pool) when the loop begins, do their jobs in parallel, and join when the loop ends
OpenMP threads follow a typical fork-join model. CUDA threads run in a bulk-synchronous parallel (BSP) model, which can also be mimicked by the fork-join model: All threads start when the kernel gets launched, but they only begin to do their jobs when the parallel loop begins. Nevertheless, the fork-join model needs the following extension to fully mimic a BSP model:
Taking CUDA as an example, we allow binding a loop to threadIdx.x inside another loop bound to threadIdx.x, which is illegal in a classic fork-join model. For example, we may implement a matmul with collaborative fetch as below:
A seemingly plausible solution to avoid this extension is to reorder Lk0 to outer-most, and then move Lk1_a and Lk1_b out of Li or Lj. This resolves the nested threadIdx.x and threadIdx.y binding problem by running Li+Lk1_a, Lj+Lk1_b and Li+Lj interleavingly, instead of running Lk1_a and Lk1_b inside Li+Lj. However, this approach is illegal, because the local variable local_sum can no longer be kept inside the body of Li and Lj: It has to be reused across multiple runs of Li and Lj
Please also note that we can bind one threadIdx.x to two loops only when the body statement is loop-invariant to one of them. For example, the following binding is still illegal, even in our extended fork-join model, because it violates its serial semantics:
| loop | : ID of the loop |
| parallel | : Parallel scope |
| allowReduction | : If false, throw InvalidSchedule if this schedule would introduce a parallel reduction |
| InvalidSchedule | if the loop is not found or unable to be parallelized |
| void freetensor::Schedule::parallelizeAs | ( | const ID & | nest, |
| const ID & | reference, | ||
| const ID & | defId | ||
| ) |
Parallelize a loop nest according to another loop nest to keep a tensor thread-local
| nest | : ID of the loop nest to be parallelized. The ID can be of any statement type, and all statements it contains will be parallelized. |
| reference | : ID of the loop nest to be referenced. The ID can be of any statement type, and all statements it contains will be referenced. |
| defId | : ID of the VarDef statement of the tensor to be kept thread-local. |
| InvalidSchedule | if any of the IDs is not found, or the reference loop nest is already thread-non-local. |
| std::vector< ID > freetensor::Schedule::permute | ( | const std::vector< ID > & | loopsId, |
| const std::function< std::vector< Expr >(std::vector< Expr >)> & | transformFunc | ||
| ) |
Permute perfectly nested loops (directly nested loops without statements in between) with the given loop space transformation function
The transformed loops follow ascending lexical order of the transformed terms returned by transformFunc when called with original iteration variables
| loopsId | : the list of IDs of perfectly nested loops to be permuted |
| transformFunc | : the loop space transformation function, should be bijective |
| InvalidSchedule | if the loops are not perfectly nested, or the permutation is not bijective, or the permutation breaks certain dependence |
FIXME: put this into schedule logs
| std::pair< ID, int > freetensor::Schedule::plutoFuse | ( | const ID & | loop0, |
| const ID & | loop1, | ||
| int | nestLevel0 = 0, |
||
| int | nestLevel1 = 0, |
||
| int | fusableOverlapThreshold = 1, |
||
| int | fusableNonOverlapTolerance = 4, |
||
| bool | doSimplify = true |
||
| ) |
Use Pluto+ algorithm to permute and fuse two loops, with as many parallelizable loops as possible at outermost levels. The two loops are required to be consecutive; all directly nested levels are detected and subject to permutation. Remaining levels that cannot be fused are left inside the fused loops as two statements
| loop0 | : The first loop to fuse |
| loop1 | : The second loop to fuse |
| nestLevel0 | : The number of nesting levels of loop 0 to be considered, defaults to maximum possible |
| nestLevel1 | : The number of nesting levels of loop 1 to be considered, defaults to maximum possible |
| fusableOverlapThreshold | : The minimum overlapping size of two loops to be regarded fusable. Defaults to 1 |
| fusableNonOverlapTolerance | : The maximum non-overlapping size at either side of two loops to be regarded fusable. Defaults to 4 |
| doSimplify | : Whether the result is simplified by the way, defaults to true |
| std::pair< ID, int > freetensor::Schedule::plutoPermute | ( | const ID & | loop, |
| int | nestLevel = 0, |
||
| bool | doSimplify = true |
||
| ) |
Use Pluto+ algorithm to permute a single loop, with as many parallelizable loops as possible at outermost levels.
| loop | : The loop to permute |
| nestLevel | : The number of nesting levels to be considered, defaults to maximum possible |
| doSimplify | : Whether the result is simplified by the way, defaults to true |
| void freetensor::Schedule::reorder | ( | const std::vector< ID > & | order, |
| ReorderMode | mode = ReorderMode::PerfectOnly |
||
| ) |
Reorder directly nested loops
To swap consecutive loops, use swap instead
| order | : Vector of loop IDs. The requested order of the loops |
| mode | : How to deal with imperfectly nested loops. PerfectOnly => throw an exception. MoveOutImperfect => do fission in advance to move out statements between the loops, which may enlarge intermediate tensors. MoveInImperfect => move statements between the loops inwards after adding guards to them, which may hurt parallelism |
| InvalidSchedule | if the input is invalid or there are breaking dependences |
| void freetensor::Schedule::separateTail | ( | bool | noDuplicateVarDefs = false | ) |
Separate main iterations and tail iterations of a loop
E.g.
Each loop will be separated into 2 parts: the body and the tail. After simplification, the program will finally be transformed to
Ideally, all programs can benefit from this schedule. However, this schedule may greatly increase the program size and make the compiling time way too long. Therefore, this transformation is implemented as a schedule, which can be applied optionally. (TODO: Optionally apply this schedule to part of the program)
| noDuplicateVarDefs | : If there are two VarDef nodes in two branches, it may result in doubled memory use, since different threads may go to different branches. Set this parameter to true to stop duplicating VarDef nodes. |
Change where a variable is stored
| def | : ID of the VarDef statement of the specific variable |
| mtype | : Where the variable should be stored |
| rejectIndirectAccess | : Registers usually do not support indirect access. If a variable is accessed indirectly, setting it to use registers is meaningless even if successful. If this parameter is set to true, throw an exception if the variable being set is accessed indirectly. Specifically, two types of access are considered indirect: 1) The index is a load from another variable, or 2) The index is a loop iterator and the loop has a dynamic length (which can not be unrolled by a backend compiler). By default, this parameter is determined automatically by mtype. |
| InvalidSchedule | if the variable is not found, or if rejecting an indirect access |
| std::pair< ID, ID > freetensor::Schedule::split | ( | const ID & | id, |
| int | factor = -1, |
||
| int | nparts = -1, |
||
| int | shift = 0 |
||
| ) |
Split a loop into two nested loops
To fission a loop into two consecutive loops, use fission instead
Two modes are provided:
1. Specify factor and leave nparts to -1. It will result in an outer loop with length ceil(n / factor), and an inner loop with length factor, where n is the original loop length added by shift. The original iterator i will be transformed to i0 * factor + i1, where i0 and i1 are the iterators of the new outer and inner loops, respectively
2. Specify nparts and leave factor to -1. It will result in an outer loop with length nparts, and an inner loop with length ceil(n / nparts), where n is the original loop length added by shift. The original iterator i will be transformed to i0 * ceil(n / nparts) + i1, where i0 and i1 are the iterators of the new outer and inner loops, respectively
Please note that the second mode will introduce an i0 * ceil(n / nparts) factor into the program, which cannot be recognized by polyhedral analysis, which may hinder some following schedules. If possible, please use the first mode, and then reorder the inner and outer loops
Suppose the original loop is labeled "L", the split two loops can be selected by "$split.0{L}" (the outer loop) and "$split.1{L}" (the inner loop). If one of the resulting loops is proved to have only a single iteration, it will be removed
| id | : ID of the loop to be split |
| factor | : Length of the inner loop. Set to -1 if using nparts |
| nparts | : Length of the outer loop. Set to -1 if using factor |
| shift | : Shift of iteration base. Defaults to zero |
| InvalidSchedule | if the loop is not found |
| void freetensor::Schedule::swap | ( | const std::vector< ID > & | order | ) |
Swap statements in the same block
To reorder nested loops, use reorder instead
| order | : list of IDs of the statements |
| InvalidSchedule | if the statements are not found or the dependences cannot be solved |
| std::vector< AutoScheduleTuneTrial > freetensor::Schedule::tuneAutoSchedule | ( | int | nBatch, |
| int | batchSize, | ||
| const Ref< Device > & | device, | ||
| const std::vector< Ref< Array > > & | args, | ||
| const std::unordered_map< std::string, Ref< Array > > & | kws = {}, |
||
| const std::regex & | toLearn = std::regex{".*"} |
||
| ) |
| void freetensor::Schedule::unroll | ( | const ID & | loop, |
| bool | immediate = false |
||
| ) |
Unroll a loop
| loop | : ID of the loop |
| immediate | : If false (by default), postpone the unroll procedure to the backend compiler, which saves scheduling time. If true, unroll the loop immediately, which may help further simplifications based on the unrolled result. If your purpose is just to fill the instruction cache, set it to false. If you are unrolling a loop that computes array indices, set it to true |
| InvalidSchedule | if the loop is not found or length of the loop is not a constant |
| void freetensor::Schedule::varMerge | ( | const ID & | def, |
| int | dim | ||
| ) |
Merge two dimensions of a variable
| def | : ID of the VarDef statement of the specific variable |
| dim | : Merge the dim-th and the (dim + 1)-th dimension |
| void freetensor::Schedule::varReorder | ( | const ID & | def, |
| const std::vector< int > & | order | ||
| ) |
Reorder the dimensions of a variable
| def | : ID of the VarDef statement of the specific variable |
| order | : new order of the dimensions |
| InvalidSchedule | if the variable or the order is illegal |
| void freetensor::Schedule::varSplit | ( | const ID & | def, |
| int | dim, | ||
| VarSplitMode | mode, | ||
| int | factor = -1, |
||
| int | nparts = -1 |
||
| ) |
Split a dimension of a variable into two
| def | : ID of the VarDef statement of the specific variable |
| dim | : which dimension to be split |
| mode | : When the dimension to split is not divisible by factor or nparts, the resulting shape may become larger. In FixedSize mode, the actual buffer size will not be changed, and guards will be added to prevent out-of-bound accesses. In RelaxedSize mode, the buffer size may increase. The RelaxedSize mode cannot be applied to I/O variables |
| factor | : Length of the inner (higher no.) dimension. Set to -1 if using nparts |
| nparts | : Length of the outer (lower no.) dimension. Set to -1 if using factor |
| InvalidSchedule | if the variable or the dimension is not found |
| void freetensor::Schedule::varSqueeze | ( | const ID & | def, |
| int | dim | ||
| ) |
Remove a singleton (1-lengthed) dimension from a variable
This is a utility schedule, which can be used together with varSplit, varMerge and/or varReorder to transform a variable to a desired shape.
| def | : ID of the VarDef statement of the specific variable |
| dim | : Remove the dim-th dimension |
| InvalidSchedule | if the variable is not found or the dimension is illegal |
| void freetensor::Schedule::varUnsqueeze | ( | const ID & | def, |
| int | dim | ||
| ) |
Insert a singleton (1-lengthed) dimension to a variable
This is a utility schedule, which can be used together with varSplit, varMerge and/or varReorder to transform a variable to a desired shape.
| def | : ID of the VarDef statement of the specific variable |
| dim | : Insert a singleton dimension at the dim-th dimension |
| InvalidSchedule | if the variable is not found or the dimension is illegal |
| void freetensor::Schedule::vectorize | ( | const ID & | loop | ) |
Vectorize a loop
Please note that, as vectorization is different from architecture to architecture, the scheduler may or may not postpone it to the backend compiler. The vectorization is a best-effort schedule
| loop | : ID of the loop |
| InvalidSchedule | if the ID or name is not found, or the dependence requirement is not met |
|
inline |
Verbose level