FreeTensor
Loading...
Searching...
No Matches
Classes | Public Types | Public Member Functions | List of all members
freetensor::Schedule Class Reference

#include <schedule.h>

Public Types

typedef std::unordered_map< ID, ID > IDMap
 

Public Member Functions

 Schedule ()=default
 
 Schedule (const Stmt &ast, int verbose=0)
 
 Schedule (const Func &func, int verbose=0)
 
 Schedule (const Schedule &)=default
 
Schedule & operator= (const Schedule &)=default
 
Schedule fork () const
 
Func func () const
 
const Stmt & ast () const
 
const ScheduleLog & logs () const
 
int verbose () const
 
template<class T >
std::vector< Stmt > findAll (const T &filter) const
 
template<class T >
std::vector< Stmt > findAtLeastOne (const T &filter) const
 
template<class T >
Stmt find (const T &filter) const
 
std::pair< ID, ID > split (const ID &id, int factor=-1, int nparts=-1, int shift=0)
 
void reorder (const std::vector< ID > &order, ReorderMode mode=ReorderMode::PerfectOnly)
 
ID merge (const ID &loop1, const ID &loop2)
 
std::vector< ID > permute (const std::vector< ID > &loopsId, const std::function< std::vector< Expr >(std::vector< Expr >)> &transformFunc)
 
std::pair< IDMap, IDMap > fission (const ID &loop, FissionSide side, const ID &splitter, bool allowEnlarge=true, const std::string &suffix0=".0", const std::string &suffix1=".1")
 
void swap (const std::vector< ID > &order)
 
void blend (const ID &loop)
 
std::tuple< ID, ID, std::string, ID > cache (const ID &stmt, const std::string &var, MemType mtype)
 
std::tuple< ID, ID, std::string, ID > cacheReduction (const ID &stmt, const std::string &var, MemType mtype)
 
void varSplit (const ID &def, int dim, VarSplitMode mode, int factor=-1, int nparts=-1)
 
void varMerge (const ID &def, int dim)
 
void varReorder (const ID &def, const std::vector< int > &order)
 
void varUnsqueeze (const ID &def, int dim)
 
void varSqueeze (const ID &def, int dim)
 
std::pair< ID, ID > moveTo (const ID &stmt, MoveToSide side, const ID &dst)
 
void inlining (const ID &def)
 
void parallelize (const ID &loop, const ParallelScope &parallel, bool allowReduction=true)
 
void parallelizeAs (const ID &nest, const ID &reference, const ID &defId)
 
void unroll (const ID &loop, bool immediate=false)
 
void vectorize (const ID &loop)
 
void separateTail (bool noDuplicateVarDefs=false)
 
void asMatMul (const ID &loop, AsMatMulMode mode, const Ref< Target > &target, MatMulBackend backend)
 
void asMatMul (const ID &loop, AsMatMulMode mode, const Ref< Target > &target)
 
void asMatMul (const ID &loop, AsMatMulMode mode=AsMatMulMode::KeepMemLayout)
 
std::pair< ID, int > plutoFuse (const ID &loop0, const ID &loop1, int nestLevel0=0, int nestLevel1=0, int fusableOverlapThreshold=1, int fusableNonOverlapTolerance=4, bool doSimplify=true)
 
std::pair< ID, int > plutoPermute (const ID &loop, int nestLevel=0, bool doSimplify=true)
 
void autoSchedule (const Ref< Target > &target, const Ref< RandTrace > &trace=nullptr)
 
void autoInline (const Ref< Target > &target)
 
void autoUseLib (const Ref< Target > &target)
 
void autoReorder (const Ref< Target > &target)
 
void autoSwap (const Ref< Target > &target)
 
void autoPluto (const Ref< Target > &target)
 
void autoFissionFuse (const Ref< Target > &target, const Ref< RandTrace > &trace=nullptr)
 
void autoMemLayout (const Ref< Target > &target)
 
void autoParallelize (const Ref< Target > &target)
 
void autoSetMemType (const Ref< Target > &target)
 
void autoUnroll (const Ref< Target > &target)
 
std::vector< AutoScheduleTuneTrial > tuneAutoSchedule (int nBatch, int batchSize, const Ref< Device > &device, const std::vector< Ref< Array > > &args, const std::unordered_map< std::string, Ref< Array > > &kws={}, const std::regex &toLearn=std::regex{".*"})
 
void beginTransaction ()
 
void commitTransaction ()
 
void abortTransaction ()
 
ID fuse (const ID &loop0, const ID &loop1, bool strict=false)
 
ID fuse (const ID &loop0, bool strict=false)
 
void setMemType (const ID &def, MemType mtype)
 
void setMemType (const ID &def, MemType mtype, bool rejectIndirectAccess)
 

Member Typedef Documentation

◆ IDMap

typedef std::unordered_map<ID, ID> freetensor::Schedule::IDMap

Constructor & Destructor Documentation

◆ Schedule() [1/4]

freetensor::Schedule::Schedule ( )
default

◆ Schedule() [2/4]

freetensor::Schedule::Schedule ( const Stmt &  ast,
int  verbose = 0 
)

◆ Schedule() [3/4]

freetensor::Schedule::Schedule ( const Func &  func,
int  verbose = 0 
)
inline

◆ Schedule() [4/4]

freetensor::Schedule::Schedule ( const Schedule & )
default

Member Function Documentation

◆ abortTransaction()

void freetensor::Schedule::abortTransaction ( )

◆ asMatMul() [1/3]

void freetensor::Schedule::asMatMul ( const ID &  loop,
AsMatMulMode  mode,
const Ref< Target > &  target 
)

◆ asMatMul() [2/3]

void freetensor::Schedule::asMatMul ( const ID &  loop,
AsMatMulMode  mode,
const Ref< Target > &  target,
MatMulBackend  backend 
)

Transform nested loops to be an external call to a matrix multiplication

Parameters
loop: ID of the loop
mode: What to do if the memory layout does not meet the requirement from the external library. KeepMemLayout => Raise an exception. TryVarReorder => try var_reorder on some variables, but may affect performance of other uses of these variables. TryTranspose => try cache and then var_reorder on some variables, but will incur extra overhead.
target: Hardware target. If omitted, use the default target in Config, or the target set by with scopes.
backend: Backend library. Defaults to Mkl for CPU targets, Cublas for GPU targets.
Exceptions
InvalidSchedule: if the loop cannot be transformed to be a matrix multiplication

◆ asMatMul() [3/3]

void freetensor::Schedule::asMatMul ( const ID &  loop,
AsMatMulMode  mode = AsMatMulMode::KeepMemLayout 
)

◆ ast()

const Stmt & freetensor::Schedule::ast ( ) const
Returns
: The statements being transformed, without a function signature

◆ autoFissionFuse()

void freetensor::Schedule::autoFissionFuse ( const Ref< Target > &  target,
const Ref< RandTrace > &  trace = nullptr 
)

(Experimental) Automatically fuse consecutive loops or vice versa using some heuristics

Parameters
target: Target architecture
trace: Random decision trace

◆ autoInline()

void freetensor::Schedule::autoInline ( const Ref< Target > &  target)

(Experimental) Automatically inline very-small VarDef nodes

Parameters
target: Target architecture

◆ autoMemLayout()

void freetensor::Schedule::autoMemLayout ( const Ref< Target > &  target)

(Experimental) Automatically adjust memory layout of variables

Parameters
target: Target architecture

◆ autoParallelize()

void freetensor::Schedule::autoParallelize ( const Ref< Target > &  target)

(Experimental) Automatically parallelize some loops using some heuristics

Parameters
target: Target architecture

IV. Recurse into sub-loops if failed

◆ autoPluto()

void freetensor::Schedule::autoPluto ( const Ref< Target > &  target)

(Experimental) Automatically apply pluto-based schedules

Parameters
target: Target architecture

◆ autoReorder()

void freetensor::Schedule::autoReorder ( const Ref< Target > &  target)

(Experimental) Automatically reorder loops in a loop nest

Parameters
target: Target architecture

◆ autoSchedule()

void freetensor::Schedule::autoSchedule ( const Ref< Target > &  target,
const Ref< RandTrace > &  trace = nullptr 
)

(Experimental) Automatic scheduling using some heuristics

Parameters
target: Target architecture
trace: Random decision trace

◆ autoSetMemType()

void freetensor::Schedule::autoSetMemType ( const Ref< Target > &  target)

(Experimental) Automatically set memory types using some heuristics

Parameters
target: Target architecture

◆ autoSwap()

void freetensor::Schedule::autoSwap ( const Ref< Target > &  target)

(Experimental) Automatically swap statements to enable more fission or fusion

Parameters
target: Target architecture

◆ autoUnroll()

void freetensor::Schedule::autoUnroll ( const Ref< Target > &  target)

(Experimental) Automatically unroll loops using some heuristics

Parameters
target: Target architecture

◆ autoUseLib()

void freetensor::Schedule::autoUseLib ( const Ref< Target > &  target)

(Experimental) Automatically use external libs using some heuristics

Parameters
target: Target architecture

◆ beginTransaction()

void freetensor::Schedule::beginTransaction ( )

Transaction of schedules

Schedules are applied in transactions. A transaction is created with beginTransaction(), applied as a whole with commitTransaction(), and can be aborted with abortTransaction()

Transactions can be nested. Technically, each schedule is by itself an inner-most transaction, while a Schedule object defines the outer-most transaction, but these inner-most and outer-most transactions are invisible to users

◆ blend()

void freetensor::Schedule::blend ( const ID &  loop)

Unroll a loop and interleave statements from each iteration

E.g.

for i = 0 to 2 {
f(i);
g(i);
}

will be transformed to be

f(0);
f(1);
g(0);
g(1);

Virtual threads in TVM can be implemented via blend

Parameters
loop: ID of the loop being transformed
Exceptions
InvalidSchedule: if the loop is not found, the loop length is not a constant, or the dependences cannot be solved

◆ cache()

std::tuple< ID, ID, std::string, ID > freetensor::Schedule::cache ( const ID &  stmt,
const std::string &  var,
MemType  mtype 
)

Cache a variable into a new local variable

All needed data will be filled into the cache first, then all reads and writes will be directed to the cache, and finally all needed data will be flushed from the cache

Note for reduction: This transformation preserves the computation order. It will transform

a += x
a += y

to

a.cache = a + x + y
a = a.cache

If you need a "real" cache for reduction, which reorders the computation, use cache_reduction instead

Parameters
stmt: ID of the statement or block (e.g. an If or a For) to be modified
var: name of the variable to be cached
mtype: where to cache
Exceptions
InvalidSchedule: if the ID or name is not found
Returns
: (ID of the statement that fills the cache, ID of the statement that flushes from the cache, name of the cache variable, ID of the VarDef node of the cache variable)

◆ cacheReduction()

std::tuple< ID, ID, std::string, ID > freetensor::Schedule::cacheReduction ( const ID &  stmt,
const std::string &  var,
MemType  mtype 
)

Perform local reductions (e.g. sum) in a local variable first, and then reduce the local result to the global variable

E.g.

a += x
a += y

will be transformed to be

a.cache = x + y
a += a.cache
Parameters
stmt: ID of the statement or block (e.g. an If or a For) to be modified
var: name of the variable to be cached. Only reductions are allowed on var in stmt. Plain reads or writes are not allowed
mtype: where to cache
Exceptions
InvalidSchedule: if the ID or name is not found, or there are unsupported reads or writes
Returns
: (ID of the statement that initializes the cache, ID of the statement that reduces the local result to the global result, name of the cache variable, ID of the VarDef node of the cache variable)

◆ commitTransaction()

void freetensor::Schedule::commitTransaction ( )

◆ find()

template<class T >
Stmt freetensor::Schedule::find ( const T &  filter) const
inline

Find the only node in the current AST satisfying a given condition

Parameters
filter: A callback that returns true for acceptance, or a Selector, or an ID
Exceptions
InvalidSchedule: if there is more than one, or there is no node found

◆ findAll()

template<class T >
std::vector< Stmt > freetensor::Schedule::findAll ( const T &  filter) const
inline

Find all nodes (maybe non-existing) in the current AST satisfying a given condition

Parameters
filter: A callback that returns true for acceptance, or a Selector, or an ID
Returns
: All statements satisfying the given condition, in DFS pre order

◆ findAtLeastOne()

template<class T >
std::vector< Stmt > freetensor::Schedule::findAtLeastOne ( const T &  filter) const
inline

Find all nodes (at least one) in the current AST satisfying a given condition

Parameters
filter: A callback that returns true for acceptance, or a Selector, or an ID

◆ fission()

std::pair< Schedule::IDMap, Schedule::IDMap > freetensor::Schedule::fission ( const ID &  loop,
FissionSide  side,
const ID &  splitter,
bool  allowEnlarge = true,
const std::string &  suffix0 = ".0",
const std::string &  suffix1 = ".1" 
)

Fission a loop into two loops each containing part of the statements, one followed by another

To split loop into two nested loops, use split instead

Statements inside the original loop will be distributed to one or both (happening if they are scope statements) loops. If a statement is originally labeled "S", it can be selected by "$fission.0{S}" (from the first loop) or "$fission.1{S}" (from the second loop) after fission. If one of the resulting loops has an empty body, it will be removed

Parameters
loop: ID of the loop to be fissioned
side: If After, splitter is the last statement of the first loop. If Before, splitter is the first statement of the second loop
splitter: Where to fission the loop
allowEnlarge: If true, try to avoid dependence by enlarging some VarDef nodes. If false, throw InvalidSchedule in such cases.
suffix0: The suffix in the op of metadata of result part 0. If empty, the fissioned part 0 preserves original ID and metadata. Cannot be empty together with suffix1.
suffix1: The suffix in the op of metadata of result part 1. If empty, the fissioned part 1 preserves original ID and metadata. Cannot be empty together with suffix0.
Exceptions
InvalidSchedule: if any dependence cannot be resolved
Returns
: ({old ID -> new ID in 1st loop}, {old ID -> new ID in 2nd loop}). If a loop is removed because it has an empty body, it will not be in the returned map

◆ fork()

Schedule freetensor::Schedule::fork ( ) const
inline

Copy the Schedule object for trying different scheduling decisions in the future

The forked object shares the same MemoizedSchedule with the original one, so common decisions can be saved and reused

The forked object shares the same RandCtx objects, so it can learn from multiple scheduling trials

◆ func()

Func freetensor::Schedule::func ( ) const
inline
Returns
: The function being transformed

◆ fuse() [1/2]

ID freetensor::Schedule::fuse ( const ID &  loop0,
bool  strict = false 
)

◆ fuse() [2/2]

ID freetensor::Schedule::fuse ( const ID &  loop0,
const ID &  loop1,
bool  strict = false 
)

Fuse two directly following loops with the same length into one

To merge nested loops into one, use merge instead

parallelize, unroll and vectorize properties will be reset on the fused loop

Suppose the original loops are labeled "L1" and "L2", the fused loop can be selected by "$fuse{L1, L2}"

Parameters
loop0: ID of the leading loop
loop1: ID of the following loop. If omitted, it will try to find a following loop of loop0
strict: If true, throw an error if unable to determine whether the two loops are of the same length
Exceptions
InvalidSchedule: if the two loops are not directly following, the two loops are not of the same length, or there is any dependence that cannot be resolved
Returns
: ID of the result loop

◆ inlining()

void freetensor::Schedule::inlining ( const ID &  def)

Remove a variable. When the variable is used, recompute its value

Parameters
def: ID of the VarDef statement of the specific variable. It cannot be an I/O variable
Exceptions
InvalidSchedule: if the variable cannot be completely removed

◆ logs()

const ScheduleLog & freetensor::Schedule::logs ( ) const
Returns
: Logs of all schedules applied

◆ merge()

ID freetensor::Schedule::merge ( const ID &  loop1,
const ID &  loop2 
)

Merge two directly nested loops into one

To fuse consecutive loops, use fuse instead

parallelize, unroll and vectorize properties will be reset on the merged loop

Suppose the original loops are labeled "L1" and "L2", the merged loop can be selected by "$merge{L1, L2}"

Parameters
loop1,loop2: ID of the loops to be merged, can be in any order
Exceptions
InvalidSchedule: if the loops are not directly nested
Returns
: ID of the merged loop

◆ moveTo()

std::pair< ID, ID > freetensor::Schedule::moveTo ( const ID &  stmt,
MoveToSide  side,
const ID &  dst 
)

Move a statement to a new position

This is a composite schedule command, which is implemented with other commands

If moving a statement out of some loops, identical loops will be added around the moved statement, which is equivalent to fission these loops

Parameters
stmt: ID of the statement to be moved
side: Whether stmt will be BEFORE or AFTER dst
dst: Insert stmt to be directly before or after this statement, as selected by side
Exceptions
InvalidSchedule: if there is no feasible path to move
Returns
: (The new ID of the moved statement, The outer-most newly introduced statements including the added loops)

◆ operator=()

Schedule & freetensor::Schedule::operator= ( const Schedule & )
default

◆ parallelize()

void freetensor::Schedule::parallelize ( const ID &  loop,
const ParallelScope &  parallel,
bool  allowReduction = true 
)

Mark a loop with a parallel implementation

This schedule follows a fork-join model: multiple workers (abstract threads) are created (but physically the threads may be cached in a thread pool) when the loop begins, do their jobs in parallel, and join when the loop ends

OpenMP threads follow a typical fork-join model. CUDA threads run in a bulk-synchronous parallel (BSP) model, which can also be mimicked by the fork-join model: All threads start when the kernel gets launched, but they only begin to do their jobs when the parallel loop begins. Nevertheless, the fork-join model needs the following extension to fully mimic a BSP model:

Taking CUDA as an example, we allow binding a loop to threadIdx.x inside another loop bound to threadIdx.x, which is illegal in a classic fork-join model. For example, we may implement a matmul with collaborative fetch as below:

for i : threadIdx.x             # Li
  for j : threadIdx.y           # Lj
    local_sum = 0               # In gpu/local memory, unique to (i, j)
    for k0                      # Lk0
      for k : threadIdx.y       # Lk1_a
        A_cache[k] = A[i, k]    # In gpu/shared, shared by different j
      for k : threadIdx.x       # Lk1_b
        B_cache[k] = B[k, j]    # In gpu/shared, shared by different i
      for k                     # Lk1_c
        local_sum += A_cache[k] * B_cache[k]
    C[i, j] = local_sum

A seemingly plausible solution to avoid this extension is to reorder Lk0 to outer-most, and then move Lk1_a and Lk1_b out of Li or Lj. This resolves the nested threadIdx.x and threadIdx.y binding problem by running Li+Lk1_a, Lj+Lk1_b and Li+Lj interleavingly, instead of running Lk1_a and Lk1_b inside Li+Lj. However, this approach is illegal, because the local variable local_sum can no longer be kept inside the body of Li and Lj: It has to be reused across multiple runs of Li and Lj

Please also note that we can bind one threadIdx.x to two loops only when the body statement is loop-invariant to one of them. For example, the following binding is still illegal, even in our extended fork-join model, because it violates its serial semantics:

for i : threadIdx.x
  for j : threadIdx.x
    A[i, j] ++
Parameters
loop: ID of the loop
parallel: Parallel scope
allowReduction: If false, throw InvalidSchedule if this schedule would introduce a parallel reduction
Exceptions
InvalidSchedule: if the loop is not found or unable to be parallelized

◆ parallelizeAs()

void freetensor::Schedule::parallelizeAs ( const ID &  nest,
const ID &  reference,
const ID &  defId 
)

Parallelize a loop nest according to another loop nest to keep a tensor thread-local

Parameters
nest: ID of the loop nest to be parallelized. The ID can be of any statement type, and all statements it contains will be parallelized.
reference: ID of the loop nest to be referenced. The ID can be of any statement type, and all statements it contains will be referenced.
defId: ID of the VarDef statement of the tensor to be kept thread-local.
Exceptions
InvalidSchedule: if any of the IDs is not found, or the reference loop nest is already thread-non-local.

◆ permute()

std::vector< ID > freetensor::Schedule::permute ( const std::vector< ID > &  loopsId,
const std::function< std::vector< Expr >(std::vector< Expr >)> &  transformFunc 
)

Permute perfectly nested loops (directly nested loops without statements in between) with the given loop space transformation function

The transformed loops follow ascending lexical order of the transformed terms returned by transformFunc when called with original iteration variables

Parameters
loopsId: the list of IDs of perfectly nested loops to be permuted
transformFunc: the loop space transformation function, should be bijective
Exceptions
InvalidSchedule: if the loops are not perfectly nested, or the permutation is not bijective, or the permutation breaks certain dependence
Returns
: the list of IDs of permuted loops

FIXME: put this into schedule logs

◆ plutoFuse()

std::pair< ID, int > freetensor::Schedule::plutoFuse ( const ID &  loop0,
const ID &  loop1,
int  nestLevel0 = 0,
int  nestLevel1 = 0,
int  fusableOverlapThreshold = 1,
int  fusableNonOverlapTolerance = 4,
bool  doSimplify = true 
)

Use Pluto+ algorithm to permute and fuse two loops, with as many parallelizable loops as possible at outermost levels. The two loops are required to be consecutive; all directly nested levels are detected and subject to permutation. Remaining levels that cannot be fused are left inside the fused loops as two statements

Parameters
loop0: The first loop to fuse
loop1: The second loop to fuse
nestLevel0: The number of nesting levels of loop 0 to be considered, defaults to maximum possible
nestLevel1: The number of nesting levels of loop 1 to be considered, defaults to maximum possible
fusableOverlapThreshold: The minimum overlapping size of two loops to be regarded fusable. Defaults to 1
fusableNonOverlapTolerance: The maximum non-overlapping size at either side of two loops to be regarded fusable. Defaults to 4
doSimplify: Whether the result is simplified by the way, defaults to true
Returns
std::pair<ID, int> : The ID of fused loop and level of parallelizable loops

◆ plutoPermute()

std::pair< ID, int > freetensor::Schedule::plutoPermute ( const ID &  loop,
int  nestLevel = 0,
bool  doSimplify = true 
)

Use Pluto+ algorithm to permute a single loop, with as many parallelizable loops as possible at outermost levels.

Parameters
loop: The loop to permute
nestLevel: The number of nesting levels to be considered, defaults to maximum possible
doSimplify: Whether the result is simplified by the way, defaults to true
Returns
std::pair<ID, int> : The ID of permuted loop and level of parallelizable loops

◆ reorder()

void freetensor::Schedule::reorder ( const std::vector< ID > &  order,
ReorderMode  mode = ReorderMode::PerfectOnly 
)

Reorder directly nested loops

To swap consecutive loops, use swap instead

Parameters
order: Vector of loop IDs. The requested order of the loops
mode: How to deal with imperfectly nested loops. PerfectOnly => throw an exception. MoveOutImperfect => do fission in advance to move out statements between the loops, which may enlarge intermediate tensors. MoveInImperfect => move statements between the loops inwards after adding guards to them, which may hurt parallelism
Exceptions
InvalidSchedule: if the input is invalid or there are breaking dependences

◆ separateTail()

void freetensor::Schedule::separateTail ( bool  noDuplicateVarDefs = false)

Separate main iterations and tail iterations of a loop

E.g.

for i = 0 -> 3 {
for j = 0 -> 4 {
if (i * 4 + j < 10) {
...
}
}
}

Each loop will be separated into 2 parts: the body and the tail. After simplification, the program will finally be transformed to

for i = 0 -> 2 {
for j = 0 -> 4 {
...
}
}
for j = 0 -> 2 {
...
}

Ideally, all programs can benefit from this schedule. However, this schedule may greatly increase the program size and make the compiling time way too long. Therefore, this transformation is implemented as a schedule, which can be applied optionally. (TODO: Optionally apply this schedule to part of the program)

Parameters
noDuplicateVarDefs: If there are two VarDef nodes in two branches, it may result in doubled memory use, since different threads may go to different branches. Set this parameter to true to stop duplicating VarDef nodes.

◆ setMemType() [1/2]

void freetensor::Schedule::setMemType ( const ID &  def,
MemType  mtype 
)

Change where a variable is stored

Parameters
def: ID of the VarDef statement of the specific variable
mtype: Where the variable should be stored
rejectIndirectAccess: Registers usually do not support indirect access. If a variable is accessed indirectly, setting it to use registers is meaningless even if successful. If this parameter is set to true, throw an exception if the variable being set is accessed indirectly. Specifically, two types of access are considered indirect: 1) The index is a load from another variable, or 2) The index is a loop iterator and the loop has a dynamic length (which cannot be unrolled by a backend compiler). By default, this parameter is determined automatically by mtype.
Exceptions
InvalidSchedule: if the variable is not found, or if rejecting an indirect access

◆ setMemType() [2/2]

void freetensor::Schedule::setMemType ( const ID &  def,
MemType  mtype,
bool  rejectIndirectAccess 
)

◆ split()

std::pair< ID, ID > freetensor::Schedule::split ( const ID &  id,
int  factor = -1,
int  nparts = -1,
int  shift = 0 
)

Split a loop into two nested loops

To fission a loop into two consecutive loops, use fission instead

Two modes are provided:

  1. Specify factor and leave nparts to -1. It will result in an outer loop with length ceil(n / factor), and an inner loop with length factor, where n is the original loop length added by shift. The original iterator i will be transformed to i0 * factor + i1, where i0 and i1 are the iterators of the new outer and inner loops, respectively
  2. Specify nparts and leave factor to -1. It will result in an outer loop with length nparts, and an inner loop with length ceil(n / nparts), where n is the original loop length added by shift. The original iterator i will be transformed to i0 * ceil(n / nparts) + i1, where i0 and i1 are the iterators of the new outer and inner loops, respectively

Please note that the second mode will introduce an i0 * ceil(n / nparts) factor into the program, which cannot be recognized by polyhedral analysis, which may hinder some following schedules. If possible, please use the first mode, and then reorder the inner and outer loops

Suppose the original loop is labeled "L", the split two loops can be selected by "$split.0{L}" (the outer loop) and "$split.1{L}" (the inner loop). If one of the resulting loops is proved to have only a single iteration, it will be removed

Parameters
id: ID of the loop to be split
factor: Length of the inner loop. Set to -1 if using nparts
nparts: Length of the outer loop. Set to -1 if using factor
shift: Shift of iteration base. Defaults to zero
Exceptions
InvalidSchedule: if the loop is not found
Returns
: (outer loop ID, inner loop ID), either ID can be invalid if the loop is proved to have only a single iteration

◆ swap()

void freetensor::Schedule::swap ( const std::vector< ID > &  order)

Swap statements in the same block

To reorder nested loops, use reorder instead

Parameters
order: list of IDs of the statements
Exceptions
InvalidSchedule: if the statements are not found or the dependences cannot be solved

◆ tuneAutoSchedule()

std::vector< AutoScheduleTuneTrial > freetensor::Schedule::tuneAutoSchedule ( int  nBatch,
int  batchSize,
const Ref< Device > &  device,
const std::vector< Ref< Array > > &  args,
const std::unordered_map< std::string, Ref< Array > > &  kws = {},
const std::regex &  toLearn = std::regex{".*"} 
)

◆ unroll()

void freetensor::Schedule::unroll ( const ID &  loop,
bool  immediate = false 
)

Unroll a loop

Parameters
loop: ID of the loop
immediate: If false (by default), postpone the unroll procedure to the backend compiler, which saves scheduling time. If true, unroll the loop immediately, which may help further simplifications based on the unrolled result. If your purpose is just to fill the instruction cache, set it to false. If you are unrolling a loop that computes array indices, set it to true
Exceptions
InvalidSchedule: if the loop is not found or length of the loop is not a constant

◆ varMerge()

void freetensor::Schedule::varMerge ( const ID &  def,
int  dim 
)

Merge two dimensions of a variable

Parameters
def: ID of the VarDef statement of the specific variable
dim: Merge the dim-th and the (dim + 1)-th dimension

◆ varReorder()

void freetensor::Schedule::varReorder ( const ID &  def,
const std::vector< int > &  order 
)

Reorder the dimensions of a variable

Parameters
def: ID of the VarDef statement of the specific variable
order: new order of the dimensions
Exceptions
InvalidSchedule: if the variable or the order is illegal

◆ varSplit()

void freetensor::Schedule::varSplit ( const ID &  def,
int  dim,
VarSplitMode  mode,
int  factor = -1,
int  nparts = -1 
)

Split a dimension of a variable into two

Parameters
def: ID of the VarDef statement of the specific variable
dim: which dimension to be split
mode: When the dimension to split is not divisible by factor or nparts, the resulting shape may become larger. In FixedSize mode, the actual buffer size will not be changed, and guards will be added to prevent out-of-bound accesses. In RelaxedSize mode, the buffer size may increase. The RelaxedSize mode cannot be applied to I/O variables
factor: Length of the inner (higher no.) dimension. Set to -1 if using nparts
nparts: Length of the outer (lower no.) dimension. Set to -1 if using factor
Exceptions
InvalidSchedule: if the variable or the dimension is not found

◆ varSqueeze()

void freetensor::Schedule::varSqueeze ( const ID &  def,
int  dim 
)

Remove a singleton (1-lengthed) dimension from a variable

This is a utility schedule, which can be used together with varSplit, varMerge and/or varReorder to transform a variable to a desired shape.

Parameters
def: ID of the VarDef statement of the specific variable
dim: Remove the dim-th dimension
Exceptions
InvalidSchedule: if the variable is not found or the dimension is illegal

◆ varUnsqueeze()

void freetensor::Schedule::varUnsqueeze ( const ID &  def,
int  dim 
)

Insert a singleton (1-lengthed) dimension to a variable

This is a utility schedule, which can be used together with varSplit, varMerge and/or varReorder to transform a variable to a desired shape.

Parameters
def: ID of the VarDef statement of the specific variable
dim: Insert a singleton dimension at the dim-th dimension
Exceptions
InvalidSchedule: if the variable is not found or the dimension is illegal

◆ vectorize()

void freetensor::Schedule::vectorize ( const ID &  loop)

Vectorize a loop

Please note that, as vectorization is different from architecture to architecture, the scheduler may or may not postpone it to the backend compiler. The vectorization is a best-effort schedule

Parameters
loop: ID of the loop
Exceptions
InvalidSchedule: if the ID or name is not found, or the dependence requirement is not met

◆ verbose()

int freetensor::Schedule::verbose ( ) const
inline

Verbose level


The documentation for this class was generated from the following files: