doxygen/html/make__parallel__reduction_8h_source.html

#ifndef FREE_TENSOR_MAKE_PARLLEL_REDUCTION_H

#define FREE_TENSOR_MAKE_PARLLEL_REDUCTION_H


#include <memory>

#include <unordered_map>

#include <unordered_set>


#include <analyze/comp_transient_bounds.h>

#include <analyze/comp_unique_bounds.h>

#include <analyze/find_loop_variance.h>

#include <analyze/symbol_table.h>

#include <driver/target.h>

#include <func.h>

#include <mutator.h>

#include <visitor.h>


namespace freetensor {


/// Per-loop record stored by `FindAllParallel` (as the mapped value of its
/// loop ID -> ParallelInfo result map).
struct ParallelInfo {

    // Parallel scope of the loop. `ParallelScope` is a variant type declared
    // in parallel_scope.h.
    ParallelScope type_;         // parallel type

    // IDs of outer loops. NOTE(review): presumably the loops enclosing this
    // one; the ordering is decided in the .cc file -- confirm there.
    std::vector<ID> outerLoops_; // outer loop ID

};


/**
 * Visitor that builds a map from loop ID to `ParallelInfo`
 *
 * Only `For` nodes get a dedicated handler; its logic is defined out of line.
 * Run the visitor on an AST, then read the outcome through `results()`.
 */
class FindAllParallel : public Visitor {
  public:
    /// Read-only access to the collected map (loop ID -> ParallelInfo)
    const std::unordered_map<ID, ParallelInfo> &results() const {
        return results_;
    }

  protected:
    void visit(const For &op) override;

  private:
    /// Loop ID -> ParallelInfo
    std::unordered_map<ID, ParallelInfo> results_;

    /// Loop IDs kept as a stack. NOTE(review): presumably the loops enclosing
    /// the node currently being visited -- confirm in the .cc file
    std::vector<ID> loopStack_;
};


/**
 * Visitor that associates each `ReduceTo` with a list of `For` loops
 *
 * The result maps a ReduceTo ID to its loops, ordered from inner to outer.
 * Both `For` and `ReduceTo` nodes have dedicated handlers, defined out of
 * line.
 */
class FindSerialLoopsOverReduce : public Visitor {
  public:
    /// Read-only access to the collected map
    /// (ReduceTo ID -> [For], from inner to outer)
    const std::unordered_map<ID, std::vector<For>> &results() const {
        return results_;
    }

  protected:
    void visit(const For &op) override;
    void visit(const ReduceTo &op) override;

  private:
    /// ReduceTo ID -> [For], from inner to outer
    std::unordered_map<ID, std::vector<For>> results_;

    /// `For` nodes kept as a stack. NOTE(review): presumably the loops
    /// enclosing the node currently being visited -- confirm in the .cc file
    std::vector<For> loopStack_;
};


/**
 * Mutator that tries to lower reductions as loop-carried reductions
 *
 * Every reduction listed in `toAlter` is first attempted as a loop-carried
 * reduction. Those that cannot be lowered this way are recorded in the set
 * returned by `toUseSync()`, which is meant to be handed to
 * `MakeSyncReduction`.
 *
 * The constructor arguments are stored as references, so they must outlive
 * this object.
 */
class MakeLoopCarriedReduction
    : public CompTransientBounds<SymbolTable<Mutator>> {
    // Must stay the first member: `using BaseClass::visit` below relies on it
    using BaseClass = CompTransientBounds<SymbolTable<Mutator>>;

    /// Per-reduction record stored in `forReductions_`
    struct ReductionItemFactors {
        ReduceOp op_;
        std::string var_;

        /// Indexed as [dim][access]
        std::vector<std::vector<Ref<CompUniqueBounds::Bound>>> bound_;

        bool syncFlush_;
    };

    /// ReduceTo ID -> Racing For ID (borrowed from the caller)
    const std::unordered_map<ID, std::unordered_set<ID>> &toAlter_;

    /// Loop variance information (borrowed from the caller)
    const LoopVariExprMap &variantMap_;

    /// ReduceTo IDs. For all reductions in `toAlter`, we first try to lower
    /// them as loop-carried reductions. If impossible, we then insert them to
    /// this set, which is passed to `MakeSyncReduction`.
    std::unordered_set<ID> toUseSync_;

    /// For ID -> parallel scope
    std::unordered_map<ID, ParallelScope> paraScopes_;

    /// ID -> [ReductionItemFactors]. NOTE(review): presumably keyed by For
    /// ID like the neighbouring maps -- confirm in the .cc file
    std::unordered_map<ID, std::vector<ReductionItemFactors>> forReductions_;

    /// For ID -> definitions at that scope
    std::unordered_map<ID, std::unordered_set<std::string>> scopeDefined_;

    std::vector<ID> paraLoopStack_;

    bool needSync(const ReduceTo &op, const ID &loopId);

  public:
    /**
     * @param toAlter : ReduceTo ID -> Racing For ID. Held by reference
     * @param variantMap : Loop variance information. Held by reference
     */
    MakeLoopCarriedReduction(
        const std::unordered_map<ID, std::unordered_set<ID>> &toAlter,
        const LoopVariExprMap &variantMap)
        : toAlter_(toAlter), variantMap_(variantMap) {}

    /// IDs of the ReduceTo nodes recorded in `toUseSync_`
    const std::unordered_set<ID> &toUseSync() const { return toUseSync_; }

  protected:
    using BaseClass::visit;
    Stmt visit(const ReduceTo &op) override;
    Stmt visit(const For &op) override;
};


/**
 * Mutator for the reductions listed in `toUseSync`
 *
 * `toUseSync` is the set of ReduceTo IDs that `MakeLoopCarriedReduction`
 * could not lower as loop-carried reductions.
 *
 * All constructor arguments are stored as references, so they must outlive
 * this object.
 */
class MakeSyncReduction : public SymbolTable<Mutator> {
    // Must stay the first member: `using BaseClass::visit` below relies on it
    using BaseClass = SymbolTable<Mutator>;

    /// ReduceTo IDs to handle (borrowed from the caller)
    const std::unordered_set<ID> &toUseSync_;

    /// ReduceTo ID -> [For], from inner to outer (borrowed from the caller)
    const std::unordered_map<ID, std::vector<For>> &serialOverRed_;

    /// Loop variance information (borrowed from the caller)
    const LoopVariExprMap &variantMap_;

#if defined(__GNUC__) && !defined(__clang__)
    // GCC<12 does not support [[maybe_unused]] on member vars. Note that this
    // branch is taken for every non-Clang GCC, whatever its version
    const Ref<Target> &target_;
#else
    [[maybe_unused]] /* used only if FT_WITH_CUDA */ const Ref<Target> &target_;
#endif

    /// Per-reduction record stored in `cacheSync_`
    struct SyncCacheInfo {
        ReduceTo oldNode_;
        std::vector<Expr> newShape_;
        std::vector<Expr> newTargetIndices_;
        std::vector<bool> preserveDim_;
    };

    /// loop ID -> [SyncCacheInfo]
    std::unordered_map<ID, std::vector<SyncCacheInfo>> cacheSync_;

    int64_t gpuThreadDim_ = 1;

    bool canResideInGPULocal(DataType dtype,
                             const std::vector<Expr> &shape) const;

    MemType localMType(MemType mtype, DataType dtype,
                       const std::vector<Expr> &shape) const;

  public:
    /**
     * @param toUseSync : ReduceTo IDs to handle. Held by reference
     * @param serialOverRed : ReduceTo ID -> [For], from inner to outer. Held
     * by reference
     * @param variantMap : Loop variance information. Held by reference
     * @param target : Target being compiled for. Held by reference
     */
    MakeSyncReduction(
        const std::unordered_set<ID> &toUseSync,
        const std::unordered_map<ID, std::vector<For>> &serialOverRed,
        const LoopVariExprMap &variantMap, const Ref<Target> &target)
        : toUseSync_(toUseSync), serialOverRed_(serialOverRed),
          variantMap_(variantMap), target_(target) {}

  protected:
    using BaseClass::visit;
    Stmt visit(const ReduceTo &op) override;
    Stmt visit(const For &op) override;
};


/**
 * Entry point of the make-parallel-reduction pass (defined out of line)
 *
 * NOTE(review): presumably drives the helper classes declared in this header
 * -- confirm in the .cc file.
 *
 * @param op : The statement to transform
 * @param target : The target passed on to the pass
 * @return : The resulting statement
 */
Stmt makeParallelReduction(const Stmt &op, const Ref<Target> &target);


// `DEFINE_PASS_FOR_FUNC` is a macro from <func.h>. NOTE(review): presumably it
// generates a wrapper applying the pass to a whole function -- confirm there.
DEFINE_PASS_FOR_FUNC(makeParallelReduction)


} // namespace freetensor


#endif // FREE_TENSOR_MAKE_PARLLEL_REDUCTION_H

freetensor::CompTransientBounds
Definition: comp_transient_bounds.h:50

freetensor::CompTransientBounds< SymbolTable< Mutator > >::visit
BaseClass::StmtRetType visit(const For &op) override
Definition: comp_transient_bounds.h:128

freetensor::DataType
Definition: data_type.h:106

freetensor::FindAllParallel
Definition: make_parallel_reduction.h:24

freetensor::FindAllParallel::results
const std::unordered_map< ID, ParallelInfo > & results() const
Definition: make_parallel_reduction.h:31

freetensor::FindAllParallel::visit
void visit(const For &op) override
Definition: make_parallel_reduction.cc:29

freetensor::FindSerialLoopsOverReduce
Definition: make_parallel_reduction.h:39

freetensor::FindSerialLoopsOverReduce::results
const std::unordered_map< ID, std::vector< For > > & results() const
Definition: make_parallel_reduction.h:45

freetensor::FindSerialLoopsOverReduce::visit
void visit(const For &op) override
Definition: make_parallel_reduction.cc:39

freetensor::ID
Definition: id.h:18

freetensor::MakeLoopCarriedReduction
Definition: make_parallel_reduction.h:61

freetensor::MakeLoopCarriedReduction::toUseSync
const auto & toUseSync() const
Definition: make_parallel_reduction.h:97

freetensor::MakeLoopCarriedReduction::MakeLoopCarriedReduction
MakeLoopCarriedReduction(const std::unordered_map< ID, std::unordered_set< ID > > &toAlter, const LoopVariExprMap &variantMap)
Definition: make_parallel_reduction.h:92

freetensor::MakeLoopCarriedReduction::visit
Stmt visit(const ReduceTo &op) override
Definition: make_parallel_reduction.cc:71

freetensor::MakeSyncReduction
Definition: make_parallel_reduction.h:109

freetensor::MakeSyncReduction::visit
Stmt visit(const ReduceTo &op) override
Definition: make_parallel_reduction.cc:229

freetensor::MakeSyncReduction::MakeSyncReduction
MakeSyncReduction(const std::unordered_set< ID > &toUseSync, const std::unordered_map< ID, std::vector< For > > &serialOverRed, const LoopVariExprMap &variantMap, const Ref< Target > &target)
Definition: make_parallel_reduction.h:171

freetensor::Ref< ForNode >

freetensor::SymbolTable
Definition: symbol_table.h:122

freetensor::SymbolTable< Mutator >::visit
BaseClass::StmtRetType visit(const VarDef &op) override
Definition: symbol_table.h:167

freetensor::Visitor
Definition: visitor.h:11

comp_transient_bounds.h

comp_unique_bounds.h

find_loop_variance.h

func.h

DEFINE_PASS_FOR_FUNC
#define DEFINE_PASS_FOR_FUNC(pass)
Definition: func.h:88

mutator.h

freetensor
Definition: allocator.h:9

freetensor::LoopVariExprMap
std::unordered_map< StmtOrExprID, std::unordered_map< ID, LoopVariability > > LoopVariExprMap
Definition: find_loop_variance.h:26

freetensor::ParallelScope
std::variant< SerialScope, OpenMPScope, CUDAStreamScope, CUDAScope > ParallelScope
Definition: parallel_scope.h:73

freetensor::Stmt
Ref< StmtNode > Stmt
Definition: ast.h:152

freetensor::ReduceOp
ReduceOp
Definition: reduce_op.h:30

freetensor::makeParallelReduction
Stmt makeParallelReduction(const Stmt &op, const Ref< Target > &target)
Definition: make_parallel_reduction.cc:376

freetensor::MemType
MemType
Definition: mem_type.h:14

freetensor::ParallelInfo
Definition: make_parallel_reduction.h:19

freetensor::ParallelInfo::outerLoops_
std::vector< ID > outerLoops_
Definition: make_parallel_reduction.h:21

freetensor::ParallelInfo::type_
ParallelScope type_
Definition: make_parallel_reduction.h:20

symbol_table.h

target.h

visitor.h