#include <schedule.h>

Public Types
typedef std::unordered_map< ID, ID >	IDMap

Public Member Functions
	Schedule ()=default

	Schedule (const Stmt &ast, int verbose=0)

	Schedule (const Func &func, int verbose=0)

	Schedule (const Schedule &)=default

Schedule &	operator= (const Schedule &)=default

Schedule	fork () const

Func	func () const

const Stmt &	ast () const

const ScheduleLog &	logs () const

int	verbose () const

template<class T >
std::vector< Stmt >	findAll (const T &filter) const

template<class T >
std::vector< Stmt >	findAtLeastOne (const T &filter) const

template<class T >
Stmt	find (const T &filter) const

std::pair< ID, ID >	split (const ID &id, int factor=-1, int nparts=-1, int shift=0)

void	reorder (const std::vector< ID > &order, ReorderMode mode=ReorderMode::PerfectOnly)

ID	merge (const ID &loop1, const ID &loop2)

std::vector< ID >	permute (const std::vector< ID > &loopsId, const std::function< std::vector< Expr >(std::vector< Expr >)> &transformFunc)

std::pair< IDMap, IDMap >	fission (const ID &loop, FissionSide side, const ID &splitter, bool allowEnlarge=true, const std::string &suffix0=".0", const std::string &suffix1=".1")

void	swap (const std::vector< ID > &order)

void	blend (const ID &loop)

std::tuple< ID, ID, std::string, ID >	cache (const ID &stmt, const std::string &var, MemType mtype)

std::tuple< ID, ID, std::string, ID >	cacheReduction (const ID &stmt, const std::string &var, MemType mtype)

void	varSplit (const ID &def, int dim, VarSplitMode mode, int factor=-1, int nparts=-1)

void	varMerge (const ID &def, int dim)

void	varReorder (const ID &def, const std::vector< int > &order)

void	varUnsqueeze (const ID &def, int dim)

void	varSqueeze (const ID &def, int dim)

std::pair< ID, ID >	moveTo (const ID &stmt, MoveToSide side, const ID &dst)

void	inlining (const ID &def)

void	parallelize (const ID &loop, const ParallelScope &parallel, bool allowReduction=true)

void	parallelizeAs (const ID &nest, const ID &reference, const ID &defId)

void	unroll (const ID &loop, bool immediate=false)

void	vectorize (const ID &loop)

void	separateTail (bool noDuplicateVarDefs=false)

void	asMatMul (const ID &loop, AsMatMulMode mode, const Ref< Target > &target, MatMulBackend backend)

void	asMatMul (const ID &loop, AsMatMulMode mode, const Ref< Target > &target)

void	asMatMul (const ID &loop, AsMatMulMode mode=AsMatMulMode::KeepMemLayout)

std::pair< ID, int >	plutoFuse (const ID &loop0, const ID &loop1, int nestLevel0=0, int nestLevel1=0, int fusableOverlapThreshold=1, int fusableNonOverlapTolerance=4, bool doSimplify=true)

std::pair< ID, int >	plutoPermute (const ID &loop, int nestLevel=0, bool doSimplify=true)

void	autoSchedule (const Ref< Target > &target, const Ref< RandTrace > &trace=nullptr)

void	autoInline (const Ref< Target > &target)

void	autoUseLib (const Ref< Target > &target)

void	autoReorder (const Ref< Target > &target)

void	autoSwap (const Ref< Target > &target)

void	autoPluto (const Ref< Target > &target)

void	autoFissionFuse (const Ref< Target > &target, const Ref< RandTrace > &trace=nullptr)

void	autoMemLayout (const Ref< Target > &target)

void	autoParallelize (const Ref< Target > &target)

void	autoSetMemType (const Ref< Target > &target)

void	autoUnroll (const Ref< Target > &target)

std::vector< AutoScheduleTuneTrial >	tuneAutoSchedule (int nBatch, int batchSize, const Ref< Device > &device, const std::vector< Ref< Array > > &args, const std::unordered_map< std::string, Ref< Array > > &kws={}, const std::regex &toLearn=std::regex{".*"})


void	beginTransaction ()

void	commitTransaction ()

void	abortTransaction ()


ID	fuse (const ID &loop0, const ID &loop1, bool strict=false)

ID	fuse (const ID &loop0, bool strict=false)


void	setMemType (const ID &def, MemType mtype)

void	setMemType (const ID &def, MemType mtype, bool rejectIndirectAccess)

Member Typedef Documentation

◆ IDMap

typedef std::unordered_map<ID, ID> freetensor::Schedule::IDMap

Constructor & Destructor Documentation

◆ Schedule() [1/4]

freetensor::Schedule::Schedule ( )

default

◆ Schedule() [2/4]

freetensor::Schedule::Schedule	(	const Stmt &	ast,
		int	verbose = `0`
	)

◆ Schedule() [3/4]

freetensor::Schedule::Schedule	(	const Func &	func,
		int	verbose = `0`
	)

inline

◆ Schedule() [4/4]

freetensor::Schedule::Schedule ( const Schedule & )

default

Member Function Documentation

◆ abortTransaction()

void freetensor::Schedule::abortTransaction ( )

◆ asMatMul() [1/3]

void freetensor::Schedule::asMatMul	(	const ID &	loop,
		AsMatMulMode	mode,
		const Ref< Target > &	target
	)

◆ asMatMul() [2/3]

void freetensor::Schedule::asMatMul	(	const ID &	loop,
		AsMatMulMode	mode,
		const Ref< Target > &	target,
		MatMulBackend	backend
	)

Transform nested loops to be an external call to a matrix multiplication

Parameters

loop	: ID of the loop
mode	: What to do if the memory layout does not meet the requirement from the external library. `KeepMemLayout` => Raise an exception. `TryVarReorder` => try `var_reorder` on some variables, but may affect performance of other uses of these variables. `TryTranspose` => try `cache` and then `var_reorder` on some variables, but will incur extra overhead.
target	: Hardware target. If omitted, use the default target in Config, or the target set by `with` scopes.
backend	: Backend library. Defaults to `Mkl` for CPU targets, `Cublas` for GPU targets.

Exceptions

InvalidSchedule if the loop cannot be transformed to be a matrix multiplication

◆ asMatMul() [3/3]

void freetensor::Schedule::asMatMul	(	const ID &	loop,
		AsMatMulMode	mode = `AsMatMulMode::KeepMemLayout`
	)

◆ ast()

const Stmt & freetensor::Schedule::ast ( ) const

Returns: : The statements being transformed, without a function signature

◆ autoFissionFuse()

void freetensor::Schedule::autoFissionFuse	(	const Ref< Target > &	target,
		const Ref< RandTrace > &	trace = `nullptr`
	)

(Experimental) Automatically fuse consecutive loops or vice versa using some heuristics

Parameters

target	: Target architecture
trace	: Random decision trace

◆ autoInline()

void freetensor::Schedule::autoInline ( const Ref< Target > & target )

(Experimental) Automatically inline very-small VarDef nodes

Parameters

target : Target architecture

◆ autoMemLayout()

void freetensor::Schedule::autoMemLayout ( const Ref< Target > & target )

(Experimental) Automatically adjust memory layout of variables

Parameters

target : Target architecture

◆ autoParallelize()

void freetensor::Schedule::autoParallelize ( const Ref< Target > & target )

(Experimental) Automatically parallelize some loops using some heuristics

Parameters

target : Target architecture

IV. Recurse into sub-loops if failed

◆ autoPluto()

void freetensor::Schedule::autoPluto ( const Ref< Target > & target )

(Experimental) Automatically apply pluto-based schedules

Parameters

target : Target architecture

◆ autoReorder()

void freetensor::Schedule::autoReorder ( const Ref< Target > & target )

(Experimental) Automatically reorder loops in a loop nest

Parameters

target : Target architecture

◆ autoSchedule()

void freetensor::Schedule::autoSchedule	(	const Ref< Target > &	target,
		const Ref< RandTrace > &	trace = `nullptr`
	)

(Experimental) Automatic scheduling using some heuristics

Parameters

target	: Target architecture
trace	: Random decision trace

◆ autoSetMemType()

void freetensor::Schedule::autoSetMemType ( const Ref< Target > & target )

(Experimental) Automatically set memory types using some heuristics

Parameters

target : Target architecture

◆ autoSwap()

void freetensor::Schedule::autoSwap ( const Ref< Target > & target )

(Experimental) Automatically swap statements to enable more fission or fusion

Parameters

target : Target architecture

◆ autoUnroll()

void freetensor::Schedule::autoUnroll ( const Ref< Target > & target )

(Experimental) Automatically unroll loops using some heuristics

Parameters

target : Target architecture

◆ autoUseLib()

void freetensor::Schedule::autoUseLib ( const Ref< Target > & target )

(Experimental) Automatically use external libs using some heuristics

Parameters

target : Target architecture

◆ beginTransaction()

void freetensor::Schedule::beginTransaction ( )

Transaction of schedules

Schedules are applied in transactions. A transaction is created with beginTransaction(), applied as a whole with commitTransaction(), and can be aborted with abortTransaction()

Transactions can be nested. Technically, each schedule is by itself an inner-most transaction, while a Schedule object defines the outer-most transaction, but these inner-most and outer-most transactions are invisible to users

◆ blend()

void freetensor::Schedule::blend ( const ID & loop )

Unroll a loop and interleave statements from each iteration

E.g.

for i = 0 to 2 {
  f(i);
  g(i);
}

will be transformed to be

f(0);
f(1);
g(0);
g(1);

Virtual threads in TVM can be implemented via blend

Parameters

loop	: ID of the loop being transformed

Exceptions

InvalidSchedule if the loop is not found, the loop length is not a constant, or the dependences cannot be solved

◆ cache()

std::tuple< ID, ID, std::string, ID > freetensor::Schedule::cache	(	const ID &	stmt,
		const std::string &	var,
		MemType	mtype
	)

Cache a variable into a new local variable

All needed data will be filled into the cache first, then all reads and writes will be directed to the cache, and finally all needed data will be flushed from the cache

Note for reduction: This transformation preserves the computation order. It will transform

a += x

a += y

to

a.cache = a + x + y

a = a.cache

If you need a "real" cache for reduction, which reorders the computation, use cache_reduction instead

Parameters

stmt	: ID of the statement or block (e.g. an If or a For) to be modified
var	: name of the variable to be cached
mtype	: where to cache

Exceptions

InvalidSchedule if the ID or name is not found

Returns: : (ID of the statement that fills the cache, ID of the statement that flushes from the cache, name of the cache variable, ID of the VarDef node of the cache variable)

◆ cacheReduction()

std::tuple< ID, ID, std::string, ID > freetensor::Schedule::cacheReduction	(	const ID &	stmt,
		const std::string &	var,
		MemType	mtype
	)

Perform local reductions (e.g. sum) in a local variable first, and then reduce the local result to the global variable

E.g.

a += x

a += y

will be transformed to be

a.cache = x + y

a += a.cache

Parameters

stmt	: ID of the statement or block (e.g. an If or a For) to be modified
var	: name of the variable to be cached. Only reductions are allowed on `var` in `stmt`. Plain reads or writes are not allowed
mtype	: where to cache

Exceptions

InvalidSchedule if the ID or name is not found, or there are unsupported reads or writes

Returns: : (ID of the statement that initializes the cache, ID of the statement that reduces the local result to the global result, name of the cache variable, ID of the VarDef node of the cache variable)

◆ commitTransaction()

void freetensor::Schedule::commitTransaction ( )

◆ find()

template<class T >

Stmt freetensor::Schedule::find ( const T & filter ) const

inline

Find the only node in the current AST satisfying a given condition

Parameters

filter : A callback that returns true for acceptance, or a Selector, or an ID

Exceptions

InvalidSchedule : if there is more than one, or there is no node found

◆ findAll()

template<class T >

std::vector< Stmt > freetensor::Schedule::findAll ( const T & filter ) const

inline

Find all nodes (maybe non-existing) in the current AST satisfying a given condition

Parameters

filter : A callback that returns true for acceptance, or a Selector, or an ID

Returns: : All statements satisfying the given condition, in DFS pre order

◆ findAtLeastOne()

template<class T >

std::vector< Stmt > freetensor::Schedule::findAtLeastOne ( const T & filter ) const

inline

Find all nodes (at least one) in the current AST satisfying a given condition

Parameters

filter : A callback that returns true for acceptance, or a Selector, or an ID

◆ fission()

std::pair< Schedule::IDMap, Schedule::IDMap > freetensor::Schedule::fission	(	const ID &	loop,
		FissionSide	side,
		const ID &	splitter,
		bool	allowEnlarge = `true`,
		const std::string &	suffix0 = `".0"`,
		const std::string &	suffix1 = `".1"`
	)

Fission a loop into two loops each containing part of the statements, one followed by another

To split loop into two nested loops, use split instead

Statements inside the original loop will be distributed to one or both (happening if they are scope statements) loops. If a statement is originally labeled "S", it can be selected by "$fission.0{S}" (from the first loop) or "$fission.1{S}" (from the second loop) after fission. If one of the resulting loops has an empty body, it will be removed

Parameters

loop	: ID of the loop to be fissioned
side	: If `After`, `splitter` is the last statement of the first loop. If `Before`, `splitter` is the first statement of the second loop
splitter	: Where to fission the loop
allowEnlarge	: If true, try to avoid dependence by enlarging some `VarDef` nodes. If false, throw `InvalidSchedule` in such cases.
suffix0	: The suffix in the `op` of metadata of result part 0. If empty, the fissioned part 0 preserves original ID and metadata. Cannot be empty together with `suffix1`.
suffix1	: The suffix in the `op` of metadata of result part 1. If empty, the fissioned part 1 preserves original ID and metadata. Cannot be empty together with `suffix0`.

Exceptions

InvalidSchedule if any dependence cannot be resolved

Returns: : ({old ID -> new ID in 1st loop}, {old ID -> new ID in 2nd loop}). If a loop is removed because it has an empty body, it will not be in the returned map

◆ fork()

Schedule freetensor::Schedule::fork ( ) const

inline

Copy the Schedule object for trying different scheduling decisions in the future

The forked object shares the same MemoizedSchedule with the original one, so common decisions can be saved and reused

The forked object shares the same RandCtx objects, so it can learn from multiple scheduling trials

◆ func()

Func freetensor::Schedule::func ( ) const

inline

Returns: : The function being transformed

◆ fuse() [1/2]

ID freetensor::Schedule::fuse	(	const ID &	loop0,
		bool	strict = `false`
	)

◆ fuse() [2/2]

ID freetensor::Schedule::fuse	(	const ID &	loop0,
		const ID &	loop1,
		bool	strict = `false`
	)

Fuse two directly following loops with the same length into one

To merge nested loops into one, use merge instead

parallelize, unroll and vectorize properties will be reset on the fused loop

Suppose the original loops are labeled "L1" and "L2", the fused loop can be selected by "$fuse{L1, L2}"

Parameters

loop0	: ID of the leading loop
loop1	: ID of the following loop. If omitted, it will try to find a following loop of `loop0`
strict	: If true, throw an error if unable to determine whether the two loops are of the same length

Exceptions

InvalidSchedule if the two loops are not directly following, the two loops are not of the same length, or there is any dependence that cannot be resolved

Returns: : ID of the result loop

◆ inlining()

void freetensor::Schedule::inlining ( const ID & def )

Remove a variable. When the variable is used, recompute its value

Parameters

def	: ID of the VarDef statement of the specific variable. It can not be an I/O variable

Exceptions

InvalidSchedule if the variable cannot be completely removed

◆ logs()

const ScheduleLog & freetensor::Schedule::logs ( ) const

Returns: : Logs of all schedules applied

◆ merge()

ID freetensor::Schedule::merge	(	const ID &	loop1,
		const ID &	loop2
	)

Merge two directly nested loops into one

To fuse consecutive loops, use fuse instead

parallelize, unroll and vectorize properties will be reset on the merged loop

Suppose the original loops are labeled "L1" and "L2", the merged loop can be selected by "$merge{L1, L2}"

Parameters

loop1,loop2 : ID of the loops to be merged, can be in any order

Exceptions

InvalidSchedule if the loops are not directly nested

Returns: : ID of the merged loop

◆ moveTo()

std::pair< ID, ID > freetensor::Schedule::moveTo	(	const ID &	stmt,
		MoveToSide	side,
		const ID &	dst
	)

Move a statement to a new position

This is a composite schedule command, which is implemented with other commands

If moving a statement out of some loops, identical loops will be added around the moved statement, which is equivalent to fission these loops

Parameters

stmt	: ID of the statement to be moved
side	: Whether `stmt` will be BEFORE or AFTER `dst`
dst	: Insert `stmt` to be directly after this statement

Exceptions

InvalidSchedule if there is no feasible path to move

Returns: : (The new ID of the moved statement, The outer-most newly introduced statements including the added loops)

◆ operator=()

Schedule & freetensor::Schedule::operator= ( const Schedule & )

default

◆ parallelize()

void freetensor::Schedule::parallelize	(	const ID &	loop,
		const ParallelScope &	parallel,
		bool	allowReduction = `true`
	)

Mark a loop with a parallel implementation

This schedule follows a fork-join model: multiple workers (abstract threads) are created (but physically the threads may be cached in a thread pool) when the loop begins, do their jobs in parallel, and join when the loop ends

OpenMP threads follow a typical fork-join model. CUDA threads run in a bulk-synchronous parallel (BSP) model, which can also be mimicked by the fork-join model: All threads start when the kernel gets launched, but they only begin to do their jobs when the parallel loop begins. Nevertheless, the fork-join model needs the following extension to fully mimic a BSP model:

Taking CUDA as an example, we allow binding a loop to threadIdx.x inside another loop bound to threadIdx.x, which is illegal in a classic fork-join model. For example, we may implement a matmul with collaborative fetch as below:

for i : threadIdx.x  # Li
  for j : threadIdx.y  # Lj
    local_sum = 0  # In gpu/local memory, unique to (i, j)
    for k0  # Lk0
      for k : threadIdx.y  # Lk1_a
        A_cache[k] = A[i, k]  # In gpu/shared, shared by different j
      for k : threadIdx.x  # Lk1_b
        B_cache[k] = B[k, j]  # In gpu/shared, shared by different i
      for k  # Lk1_c
        local_sum += A_cache[k] * B_cache[k]
    C[i, j] = local_sum

A seemingly plausible solution to avoid this extension is to reorder Lk0 to outer-most, and then move Lk1_a and Lk1_b out of Li or Lj. This resolves the nested threadIdx.x and threadIdx.y binding problem by running Li+Lk1_a, Lj+Lk1_b and Li+Lj interleavingly, instead of running Lk1_a and Lk1_b inside Li+Lj. However, this approach is illegal, because the local variable local_sum can no longer be kept inside the body of Li and Lj: It has to be reused across multiple runs of Li and Lj

Please also note that we can bind one threadIdx.x to two loops only when the body statement is loop-invariant to one of them. For example, the following binding is still illegal, even in our extended fork-join model, because it violates its serial semantics:

for i : threadIdx.x
  for j : threadIdx.x
    A[i, j] ++

Parameters

loop	: ID of the loop
parallel	: Parallel scope
allowReduction	: If false, throw InvalidSchedule if this schedule would introduce a parallel reduction

Exceptions

InvalidSchedule if the loop is not found or unable to be parallelized

◆ parallelizeAs()

void freetensor::Schedule::parallelizeAs	(	const ID &	nest,
		const ID &	reference,
		const ID &	defId
	)

Parallelize a loop nest according to another loop nest to keep a tensor thread-local

Parameters

nest	: ID of the loop nest to be parallelized. The ID can be of any statement type, and all statements it contains will be parallelized.
reference	: ID of the loop nest to be referenced. The ID can be of any statement type, and all statements it contains will be referenced.
defId	: ID of the VarDef statement of the tensor to be kept thread-local.

Exceptions

InvalidSchedule if any of the IDs is not found, or the reference loop nest is already thread-non-local.

◆ permute()

std::vector< ID > freetensor::Schedule::permute	(	const std::vector< ID > &	loopsId,
		const std::function< std::vector< Expr >(std::vector< Expr >)> &	transformFunc
	)

Permute perfectly nested loops (directly nested loops without statements in between) with the given loop space transformation function

The transformed loops follow ascending lexical order of the transformed terms returned by transformFunc when called with original iteration variables

Parameters

loopsId	: the list of IDs of perfectly nested loops to be permuted
transformFunc	: the loop space transformation function, should be bijective

Exceptions

InvalidSchedule if the loops are not perfectly nested, or the permutation is not bijective, or the permutation breaks certain dependence

Returns: : the list of IDs of permuted loops

FIXME: put this into schedule logs

◆ plutoFuse()

std::pair< ID, int > freetensor::Schedule::plutoFuse	(	const ID &	loop0,
		const ID &	loop1,
		int	nestLevel0 = `0`,
		int	nestLevel1 = `0`,
		int	fusableOverlapThreshold = `1`,
		int	fusableNonOverlapTolerance = `4`,
		bool	doSimplify = `true`
	)

Use Pluto+ algorithm to permute and fuse two loops, with as many parallelizable loops as possible at outermost levels. The two loops are required to be consecutive; all directly nested levels are detected and subject to permutation. Remaining levels that cannot be fused are left inside the fused loops as two statements

Parameters

loop0	: The first loop to fuse
loop1	: The second loop to fuse
nestLevel0	: The number of nesting levels of loop 0 to be considered, defaults to maximum possible
nestLevel1	: The number of nesting levels of loop 1 to be considered, defaults to maximum possible
fusableOverlapThreshold	: The minimum overlapping size of two loops to be regarded fusable. Defaults to 1
fusableNonOverlapTolerance	: The maximum non-overlapping size at either side of two loops to be regarded fusable. Defaults to 4
doSimplify	: Whether the result is simplified by the way, defaults to true

Returns: std::pair<ID, int> : The ID of fused loop and level of parallelizable loops

◆ plutoPermute()

std::pair< ID, int > freetensor::Schedule::plutoPermute	(	const ID &	loop,
		int	nestLevel = `0`,
		bool	doSimplify = `true`
	)

Use Pluto+ algorithm to permute a single loop, with as many parallelizable loops as possible at outermost levels.

Parameters

loop	: The loop to permute
nestLevel	: The number of nesting levels to be considered, defaults to maximum possible
doSimplify	: Whether the result is simplified by the way, defaults to true

Returns: std::pair<ID, int> : The ID of permuted loop and level of parallelizable loops

◆ reorder()

void freetensor::Schedule::reorder	(	const std::vector< ID > &	order,
		ReorderMode	mode = `ReorderMode::PerfectOnly`
	)

Reorder directly nested loops

To swap consecutive loops, use swap instead

Parameters

order	: Vector of loop IDs. The requested order of the loops
mode	: How to deal with imperfectly nested loops. `PerfectOnly` => throw an exception. `MoveOutImperfect` => do `fission` in advance to move out statements between the loops, which may enlarge intermediate tensors. `MoveInImperfect` => move statements between the loops inwards after adding guards to them, which may hurt parallelism

Exceptions

InvalidSchedule if the input is invalid or there are breaking dependences

◆ separateTail()

void freetensor::Schedule::separateTail ( bool noDuplicateVarDefs = false )

Separate main iterations and tail iterations of a loop

E.g.

for i = 0 -> 3 {
  for j = 0 -> 4 {
     if (i * 4 + j < 10) {
       ...
     }
  }
}

Each loop will be separated into 2 parts: the body and the tail. After simplification, the program will finally be transformed to

for i = 0 -> 2 {
  for j = 0 -> 4 {
    ...
  }
}
for j = 0 -> 2 {
  ...
}

Ideally, all programs can benefit from this schedule. However, this schedule may greatly increase the program size and make the compiling time way too long. Therefore, this transformation is implemented as a schedule, which can be applied optionally. (TODO: Optionally apply this schedule to part of the program)

Parameters

noDuplicateVarDefs : If there are two VarDef nodes in two branches, it may result in doubled memory use, since different threads may go to different branches. Set this parameter to true to stop duplicating VarDef nodes.

◆ setMemType() [1/2]

void freetensor::Schedule::setMemType	(	const ID &	def,
		MemType	mtype
	)

Change where a variable is stored

Parameters

def	: ID of the VarDef statement of the specific variable
mtype	: Where the variable should be stored
rejectIndirectAccess	: Registers usually do not support indirect access. If a variable is accessed indirectly, setting it to use registers is meaningless even if successful. If this parameter is set to true, throw an exception if the variable being set is accessed indirectly. Specifically, two types of access are considered indirect: 1) The index is a load from another variable, or 2) The index is a loop iterator and the loop has a dynamic length (which can not be unrolled by a backend compiler). By default, this parameter is determined automatically by `mtype`.

Exceptions

InvalidSchedule if the variable is not found, or if rejecting an indirect access

◆ setMemType() [2/2]

void freetensor::Schedule::setMemType	(	const ID &	def,
		MemType	mtype,
		bool	rejectIndirectAccess
	)

◆ split()

std::pair< ID, ID > freetensor::Schedule::split	(	const ID &	id,
		int	factor = `-1`,
		int	nparts = `-1`,
		int	shift = `0`
	)

Split a loop into two nested loops

To fission a loop into two consecutive loops, use fission instead

Two modes are provided:

Specify factor and leave nparts to -1. It will result in an outer loop with length ceil(n / factor), and an inner loop with length factor, where n is the original loop length added by shift. The original iterator i will be transformed to i0 * factor + i1, where i0 and i1 are the iterators of the new outer and inner loops, respectively
Specify nparts and leave factor to -1. It will result in an outer loop with length nparts, and an inner loop with length ceil(n / nparts), where n is the original loop length added by shift. The original iterator i will be transformed to i0 * ceil(n / nparts) + i1, where i0 and i1 are the iterators of the new outer and inner loops, respectively

Please note that the second mode will introduce an i0 * ceil(n / nparts) factor into the program, which cannot be recognized by polyhedral analysis, which may hinder some following schedules. If possible, please use the first mode, and then reorder the inner and outer loops

Suppose the original loop is labeled "L", the split two loops can be selected by "$split.0{L}" (the outer loop) and "$split.1{L}" (the inner loop). If one of the resulting loops is proved to have only a single iteration, it will be removed

Parameters

id	: ID of the loop to be split
factor	: Length of the inner loop. Set to -1 if using `nparts`
nparts	: Length of the outer loop. Set to -1 if using `factor`
shift	: Shift of iteration base. Defaults to zero

Exceptions

InvalidSchedule if the loop is not found

Returns: : (outer loop ID, inner loop ID), either ID can be invalid if the loop is proved to have only a single iteration

◆ swap()

void freetensor::Schedule::swap ( const std::vector< ID > & order )

Swap statements in the same block

To reorder nested loops, use reorder instead

Parameters

order : list of IDs of the statements

Exceptions

InvalidSchedule if the statements are not found or the dependences cannot be solved

◆ tuneAutoSchedule()

std::vector< AutoScheduleTuneTrial > freetensor::Schedule::tuneAutoSchedule	(	int	nBatch,
		int	batchSize,
		const Ref< Device > &	device,
		const std::vector< Ref< Array > > &	args,
		const std::unordered_map< std::string, Ref< Array > > &	kws = `{}`,
		const std::regex &	toLearn = `std::regex{".*"}`
	)

◆ unroll()

void freetensor::Schedule::unroll	(	const ID &	loop,
		bool	immediate = `false`
	)

Unroll a loop

Parameters

loop	: ID of the loop
immediate	: If false (by default), postpone the unroll procedure to the backend compiler, which saves scheduling time. If true, unroll the loop immediately, which may help further simplifications based on the unrolled result. If your purpose is just to fill the instruction cache, set it to false. If you are unrolling a loop that computes array indices, set it to true

Exceptions

InvalidSchedule if the loop is not found or length of the loop is not a constant

◆ varMerge()

void freetensor::Schedule::varMerge	(	const ID &	def,
		int	dim
	)

Merge two dimensions of a variable

Parameters

def	: ID of the VarDef statement of the specific variable
dim	: Merge the `dim`-th and the `(dim + 1)`-th dimension

◆ varReorder()

void freetensor::Schedule::varReorder	(	const ID &	def,
		const std::vector< int > &	order
	)

Reorder the dimensions of a variable

Parameters

def	: ID of the VarDef statement of the specific variable
order	: new order of the dimensions

Exceptions

InvalidSchedule if the variable or the order is illegal

◆ varSplit()

void freetensor::Schedule::varSplit	(	const ID &	def,
		int	dim,
		VarSplitMode	mode,
		int	factor = `-1`,
		int	nparts = `-1`
	)

Split a dimension of a variable into two

Parameters

def	: ID of the VarDef statement of the specific variable
dim	: which dimension to be split
mode	: When the dimension to split is not divisible by `factor` or `nparts`, the resulting shape may become larger. In `FixedSize` mode, the actual buffer size will not be changed, and guards will be added to prevent out-of-bound accesses. In `RelaxedSize` mode, the buffer size may increase. The `RelaxedSize` mode cannot be applied to I/O variables
factor	: Length of the inner (higher no.) dimension. Set to -1 if using `nparts`
nparts	: Length of the outer (lower no.) dimension. Set to -1 if using `factor`

Exceptions

InvalidSchedule if the variable or the dimension is not found

◆ varSqueeze()

void freetensor::Schedule::varSqueeze	(	const ID &	def,
		int	dim
	)

Remove a singleton (1-lengthed) dimension from a variable

This is a utility schedule, which can be used together with varSplit, varMerge and/or varReorder to transform a variable to a desired shape.

Parameters

def	: ID of the VarDef statement of the specific variable
dim	: Remove the `dim`-th dimension

Exceptions

InvalidSchedule if the variable is not found or the dimension is illegal

◆ varUnsqueeze()

void freetensor::Schedule::varUnsqueeze	(	const ID &	def,
		int	dim
	)

Insert a singleton (1-lengthed) dimension to a variable

This is a utility schedule, which can be used together with varSplit, varMerge and/or varReorder to transform a variable to a desired shape.

Parameters

def	: ID of the VarDef statement of the specific variable
dim	: Insert a singleton dimension at the `dim`-th dimension

Exceptions

InvalidSchedule if the variable is not found or the dimension is illegal

◆ vectorize()

void freetensor::Schedule::vectorize ( const ID & loop )

Vectorize a loop

Please note that, as vectorization is different from architecture to architecture, the scheduler may or may not postpone it to the backend compiler. The vectorization is a best-effort schedule

Parameters

loop	: ID of the loop

Exceptions

InvalidSchedule if the ID or name is not found, or the dependence requirement is not met

◆ verbose()

int freetensor::Schedule::verbose ( ) const

inline

Verbose level

The documentation for this class was generated from the following files:

include/schedule.h
src/schedule.cc
src/schedule/as_matmul.cc
src/schedule/auto_fission_fuse.cc
src/schedule/auto_inline.cc
src/schedule/auto_mem_layout.cc
src/schedule/auto_parallelize.cc
src/schedule/auto_pluto.cc
src/schedule/auto_reorder.cc
src/schedule/auto_set_mem_type.cc
src/schedule/auto_swap.cc
src/schedule/auto_unroll.cc
src/schedule/auto_use_lib.cc
src/schedule/blend.cc
src/schedule/cache.cc
src/schedule/fission.cc
src/schedule/fuse.cc
src/schedule/inlining.cc
src/schedule/merge.cc
src/schedule/move_to.cc
src/schedule/parallelize.cc
src/schedule/parallelize_as.cc
src/schedule/permute.cc
src/schedule/pluto.cc
src/schedule/reorder.cc
src/schedule/separate_tail.cc
src/schedule/set_mem_type.cc
src/schedule/split.cc
src/schedule/swap.cc
src/schedule/unroll.cc
src/schedule/var_merge.cc
src/schedule/var_reorder.cc
src/schedule/var_split.cc
src/schedule/var_squeeze.cc
src/schedule/var_unsqueeze.cc
src/schedule/vectorize.cc

Public Types

Public Member Functions

Member Typedef Documentation

◆ IDMap

Constructor & Destructor Documentation

◆ Schedule() [1/4]

◆ Schedule() [2/4]

◆ Schedule() [3/4]

◆ Schedule() [4/4]

Member Function Documentation

◆ abortTransaction()

◆ asMatMul() [1/3]

◆ asMatMul() [2/3]

◆ asMatMul() [3/3]

◆ ast()

◆ autoFissionFuse()

◆ autoInline()

◆ autoMemLayout()

◆ autoParallelize()

◆ autoPluto()

◆ autoReorder()

◆ autoSchedule()

◆ autoSetMemType()

◆ autoSwap()

◆ autoUnroll()

◆ autoUseLib()

◆ beginTransaction()

◆ blend()

◆ cache()

◆ cacheReduction()

◆ commitTransaction()

◆ find()

◆ findAll()

◆ findAtLeastOne()

◆ fission()

◆ fork()

◆ func()

◆ fuse() [1/2]

◆ fuse() [2/2]

◆ inlining()

◆ logs()

◆ merge()

◆ moveTo()

◆ operator=()

◆ parallelize()

◆ parallelizeAs()

◆ permute()

◆ plutoFuse()

◆ plutoPermute()

◆ reorder()

◆ separateTail()

◆ setMemType() [1/2]

◆ setMemType() [2/2]

◆ split()

◆ swap()

◆ tuneAutoSchedule()

◆ unroll()

◆ varMerge()

◆ varReorder()

◆ varSplit()

◆ varSqueeze()

◆ varUnsqueeze()

◆ vectorize()

◆ verbose()