From 491be807d7b1097e89e536510be405bf8f1e12df Mon Sep 17 00:00:00 2001
From: pineappleEA <pineaea@gmail.com>
Date: Sun, 11 Feb 2024 00:24:12 +0100
Subject: [PATCH] early-access version 4130

---
 README.md                                     |   2 +-
 .../hle/service/nvdrv/devices/nvhost_gpu.cpp  |   7 +
 src/video_core/control/channel_state.h        |   6 +
 src/video_core/control/scheduler.cpp          | 194 +++++++++++++++++-
 src/video_core/control/scheduler.h            |  22 +-
 src/video_core/engines/puller.cpp             |  12 +-
 src/video_core/gpu.cpp                        |   8 +
 src/video_core/gpu.h                          |   9 +-
 src/video_core/gpu_thread.cpp                 |  14 +-
 src/video_core/gpu_thread.h                   |   9 +-
 src/video_core/texture_cache/image_info.cpp   |   1 -
 src/video_core/texture_cache/image_info.h     |   1 -
 src/video_core/texture_cache/texture_cache.h  |  15 +-
 13 files changed, 258 insertions(+), 42 deletions(-)

diff --git a/README.md b/README.md
index b647f6931..0734f6955 100755
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 yuzu emulator early access
 =============
 
-This is the source code for early-access 4129.
+This is the source code for early-access 4130.
 
 ## Legal Notice
 
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
index e71c58aa9..f537fb203 100755
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@@ -13,6 +13,7 @@
 #include "core/hle/service/nvdrv/nvdrv.h"
 #include "core/memory.h"
 #include "video_core/control/channel_state.h"
+#include "video_core/control/scheduler.h"
 #include "video_core/engines/puller.h"
 #include "video_core/gpu.h"
 #include "video_core/host1x/host1x.h"
@@ -33,6 +34,7 @@ nvhost_gpu::nvhost_gpu(Core::System& system_, EventInterface& events_interface_,
       syncpoint_manager{core_.GetSyncpointManager()}, nvmap{core.GetNvMapFile()},
       channel_state{system.GPU().AllocateChannel()} {
     channel_syncpoint = syncpoint_manager.AllocateSyncpoint(false);
+    channel_state->syncpoint_id = channel_syncpoint;
     sm_exception_breakpoint_int_report_event =
         events_interface.CreateEvent("GpuChannelSMExceptionBreakpointInt");
     sm_exception_breakpoint_pause_report_event =
@@ -157,6 +159,9 @@ NvResult nvhost_gpu::SetErrorNotifier(IoctlSetErrorNotifier& params) {
 
 NvResult nvhost_gpu::SetChannelPriority(IoctlChannelSetPriority& params) {
     channel_priority = params.priority;
+    if (channel_state->initialized) {
+        system.GPU().Scheduler().ChangePriority(channel_state->bind_id, channel_priority);
+    }
     LOG_DEBUG(Service_NVDRV, "(STUBBED) called, priority={:X}", channel_priority);
     return NvResult::Success;
 }
@@ -314,6 +319,7 @@ NvResult nvhost_gpu::GetWaitbase(IoctlGetWaitbase& params) {
 NvResult nvhost_gpu::ChannelSetTimeout(IoctlChannelSetTimeout& params) {
     LOG_INFO(Service_NVDRV, "called, timeout=0x{:X}", params.timeout);
 
+    channel_state->timeout = params.timeout;
     return NvResult::Success;
 }
 
@@ -321,6 +327,7 @@ NvResult nvhost_gpu::ChannelSetTimeslice(IoctlSetTimeslice& params) {
     LOG_INFO(Service_NVDRV, "called, timeslice=0x{:X}", params.timeslice);
 
     channel_timeslice = params.timeslice;
+    channel_state->timeslice = params.timeslice;
 
     return NvResult::Success;
 }
diff --git a/src/video_core/control/channel_state.h b/src/video_core/control/channel_state.h
index ceaa92647..9f9b4ff75 100755
--- a/src/video_core/control/channel_state.h
+++ b/src/video_core/control/channel_state.h
@@ -45,6 +45,12 @@ struct ChannelState {
     void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
 
     s32 bind_id = -1;
+    /// Scheduling info
+    u32 syncpoint_id = 0xFFFF;
+    u32 priority = 0;
+    u32 timeslice = 0;
+    u32 timeout = 0;
+
     /// 3D engine
     std::unique_ptr<Engines::Maxwell3D> maxwell_3d;
     /// 2D engine
diff --git a/src/video_core/control/scheduler.cpp b/src/video_core/control/scheduler.cpp
index 31e3b1235..91d81500a 100755
--- a/src/video_core/control/scheduler.cpp
+++ b/src/video_core/control/scheduler.cpp
@@ -1,32 +1,204 @@
 // SPDX-FileCopyrightText: 2021 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-3.0-or-later
 
+#include <atomic>
+#include <deque>
+#include <map>
 #include <memory>
+#include <mutex>
+#include <unordered_map>
 
 #include "common/assert.h"
-#include "video_core/control/channel_state.h"
+#include "common/fiber.h"
 #include "video_core/control/scheduler.h"
+#include "video_core/dma_pusher.h"
 #include "video_core/gpu.h"
 
 namespace Tegra::Control {
-Scheduler::Scheduler(GPU& gpu_) : gpu{gpu_} {}
+
+struct GPFifoContext {
+    bool is_active;
+    bool is_running;
+    std::shared_ptr<Common::Fiber> context;
+    std::deque<CommandList> pending_work;
+    std::mutex guard;
+    s32 bind_id;
+    std::shared_ptr<ChannelState> info;
+    size_t yield_count;
+    size_t scheduled_count;
+};
+
+struct Scheduler::SchedulerImpl {
+    // Fifos
+    std::map<u32, std::list<size_t>, std::greater<u32>> schedule_priority_queue;
+    std::unordered_map<s32, size_t> channel_gpfifo_ids;
+    std::deque<GPFifoContext> gpfifos;
+    std::deque<size_t> free_fifos;
+
+    // Scheduling
+    std::mutex scheduling_guard;
+    std::shared_ptr<Common::Fiber> master_control;
+    bool must_reschedule{};
+    GPFifoContext* current_fifo{};
+};
+
+Scheduler::Scheduler(GPU& gpu_) : gpu{gpu_} {
+    impl = std::make_unique<SchedulerImpl>();
+}
 
 Scheduler::~Scheduler() = default;
 
+void Scheduler::Init() {
+    impl->master_control = Common::Fiber::ThreadToFiber();
+}
+
+void Scheduler::Resume() {
+    bool pending_work;
+    do {
+        pending_work = false;
+        {
+            std::unique_lock lk(impl->scheduling_guard);
+            impl->current_fifo = nullptr;
+            auto it = impl->schedule_priority_queue.begin();
+            while (it != impl->schedule_priority_queue.end()) {
+                pending_work = ScheduleLevel(it->second);
+                if (pending_work) {
+                    break;
+                }
+                it = std::next(it);
+            }
+            if (pending_work) {
+                impl->must_reschedule = false;
+            }
+        }
+        if (impl->current_fifo) {
+            impl->current_fifo->scheduled_count++;
+            Common::Fiber::YieldTo(impl->master_control, *impl->current_fifo->context);
+        }
+    } while (pending_work);
+}
+
+bool Scheduler::ScheduleLevel(std::list<size_t>& queue) {
+    bool found_anything = false;
+    size_t min_schedule_count = std::numeric_limits<size_t>::max();
+    for (auto id : queue) {
+        auto& fifo = impl->gpfifos[id];
+        std::scoped_lock lk2(fifo.guard);
+        if (!fifo.pending_work.empty() || fifo.is_running) {
+            if (fifo.scheduled_count > min_schedule_count) {
+                continue;
+            }
+            if (fifo.scheduled_count < fifo.yield_count) {
+                fifo.scheduled_count++;
+                continue;
+            }
+            min_schedule_count = fifo.scheduled_count;
+            impl->current_fifo = &fifo;
+            found_anything = true;
+        }
+    }
+    return found_anything;
+}
+
+void Scheduler::ChangePriority(s32 channel_id, u32 new_priority) {
+    std::unique_lock lk(impl->scheduling_guard);
+    auto fifo_it = impl->channel_gpfifo_ids.find(channel_id);
+    if (fifo_it == impl->channel_gpfifo_ids.end()) {
+        return;
+    }
+    const size_t fifo_id = fifo_it->second;
+    auto& fifo = impl->gpfifos[fifo_id];
+    const auto old_priority = fifo.info->priority;
+    fifo.info->priority = new_priority;
+    impl->schedule_priority_queue.try_emplace(new_priority);
+    impl->schedule_priority_queue[new_priority].push_back(fifo_id);
+    impl->schedule_priority_queue[old_priority].remove_if(
+        [fifo_id](size_t id) { return id == fifo_id; });
+}
+
+void Scheduler::Yield() {
+    ASSERT(impl->current_fifo != nullptr);
+    impl->current_fifo->yield_count = impl->current_fifo->scheduled_count + 1;
+    Common::Fiber::YieldTo(impl->current_fifo->context, *impl->master_control);
+    gpu.BindChannel(impl->current_fifo->bind_id);
+}
+
+void Scheduler::CheckStatus() {
+    {
+        std::unique_lock lk(impl->scheduling_guard);
+        if (!impl->must_reschedule) {
+            return;
+        }
+    }
+    Common::Fiber::YieldTo(impl->current_fifo->context, *impl->master_control);
+    gpu.BindChannel(impl->current_fifo->bind_id);
+}
+
 void Scheduler::Push(s32 channel, CommandList&& entries) {
-    std::unique_lock lk(scheduling_guard);
-    auto it = channels.find(channel);
-    ASSERT(it != channels.end());
-    auto channel_state = it->second;
-    gpu.BindChannel(channel_state->bind_id);
-    channel_state->dma_pusher->Push(std::move(entries));
-    channel_state->dma_pusher->DispatchCalls();
+    std::unique_lock lk(impl->scheduling_guard);
+    auto it = impl->channel_gpfifo_ids.find(channel);
+    ASSERT(it != impl->channel_gpfifo_ids.end());
+    auto gpfifo_id = it->second;
+    auto& fifo = impl->gpfifos[gpfifo_id];
+    {
+        std::scoped_lock lk2(fifo.guard);
+        fifo.pending_work.emplace_back(std::move(entries));
+    }
+    if (impl->current_fifo != nullptr && impl->current_fifo->info->priority < fifo.info->priority) {
+        impl->must_reschedule = true;
+    }
+}
+
+void Scheduler::ChannelLoop(size_t gpfifo_id, s32 channel_id) {
+    gpu.BindChannel(channel_id);
+    auto& fifo = impl->gpfifos[gpfifo_id];
+    while (true) {
+        auto* channel_state = fifo.info.get();
+        fifo.guard.lock();
+        while (!fifo.pending_work.empty()) {
+            fifo.is_running = true;
+            {
+                CommandList&& entries = std::move(fifo.pending_work.front());
+                channel_state->dma_pusher->Push(std::move(entries));
+                fifo.pending_work.pop_front();
+            }
+            fifo.guard.unlock();
+            channel_state->dma_pusher->DispatchCalls();
+            CheckStatus();
+            fifo.guard.lock();
+        }
+        fifo.is_running = false;
+        fifo.guard.unlock();
+        Common::Fiber::YieldTo(fifo.context, *impl->master_control);
+        gpu.BindChannel(channel_id);
+    }
 }
 
 void Scheduler::DeclareChannel(std::shared_ptr<ChannelState> new_channel) {
     s32 channel = new_channel->bind_id;
-    std::unique_lock lk(scheduling_guard);
-    channels.emplace(channel, new_channel);
+    std::unique_lock lk(impl->scheduling_guard);
+
+    size_t new_fifo_id;
+    if (!impl->free_fifos.empty()) {
+        new_fifo_id = impl->free_fifos.front();
+        impl->free_fifos.pop_front();
+    } else {
+        new_fifo_id = impl->gpfifos.size();
+        impl->gpfifos.emplace_back();
+    }
+    auto& new_fifo = impl->gpfifos[new_fifo_id];
+    impl->channel_gpfifo_ids[channel] = new_fifo_id;
+    new_fifo.is_active = true;
+    new_fifo.bind_id = channel;
+    new_fifo.pending_work.clear();
+    new_fifo.info = new_channel;
+    new_fifo.scheduled_count = 0;
+    new_fifo.yield_count = 0;
+    new_fifo.is_running = false;
+    impl->schedule_priority_queue.try_emplace(new_channel->priority);
+    impl->schedule_priority_queue[new_channel->priority].push_back(new_fifo_id);
+    std::function<void()> callback = std::bind(&Scheduler::ChannelLoop, this, new_fifo_id, channel);
+    new_fifo.context = std::make_shared<Common::Fiber>(std::move(callback));
 }
 
 } // namespace Tegra::Control
diff --git a/src/video_core/control/scheduler.h b/src/video_core/control/scheduler.h
index eab52e536..faa888dde 100755
--- a/src/video_core/control/scheduler.h
+++ b/src/video_core/control/scheduler.h
@@ -3,10 +3,11 @@
 
 #pragma once
 
+#include <list>
 #include <memory>
-#include <mutex>
-#include <unordered_map>
 
+#include "common/common_types.h"
+#include "video_core/control/channel_state.h"
 #include "video_core/dma_pusher.h"
 
 namespace Tegra {
@@ -22,13 +23,26 @@ public:
     explicit Scheduler(GPU& gpu_);
     ~Scheduler();
 
+    void Init();
+
+    void Resume();
+
+    void Yield();
+
     void Push(s32 channel, CommandList&& entries);
 
     void DeclareChannel(std::shared_ptr<ChannelState> new_channel);
 
+    void ChangePriority(s32 channel_id, u32 new_priority);
+
 private:
-    std::unordered_map<s32, std::shared_ptr<ChannelState>> channels;
-    std::mutex scheduling_guard;
+    void ChannelLoop(size_t gpfifo_id, s32 channel_id);
+    bool ScheduleLevel(std::list<size_t>& queue);
+    void CheckStatus();
+
+    struct SchedulerImpl;
+    std::unique_ptr<SchedulerImpl> impl;
+
     GPU& gpu;
 };
 
diff --git a/src/video_core/engines/puller.cpp b/src/video_core/engines/puller.cpp
index 79d84d662..c39cada43 100755
--- a/src/video_core/engines/puller.cpp
+++ b/src/video_core/engines/puller.cpp
@@ -6,6 +6,7 @@
 #include "common/settings.h"
 #include "core/core.h"
 #include "video_core/control/channel_state.h"
+#include "video_core/control/scheduler.h"
 #include "video_core/dma_pusher.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/kepler_compute.h"
@@ -14,6 +15,8 @@
 #include "video_core/engines/maxwell_dma.h"
 #include "video_core/engines/puller.h"
 #include "video_core/gpu.h"
+#include "video_core/host1x/host1x.h"
+#include "video_core/host1x/syncpoint_manager.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 
@@ -60,11 +63,14 @@ void Puller::ProcessBindMethod(const MethodCall& method_call) {
 }
 
 void Puller::ProcessFenceActionMethod() {
+    auto& syncpoint_manager = gpu.Host1x().GetSyncpointManager();
     switch (regs.fence_action.op) {
     case Puller::FenceOperation::Acquire:
-        // UNIMPLEMENTED_MSG("Channel Scheduling pending.");
-        // WaitFence(regs.fence_action.syncpoint_id, regs.fence_value);
-        rasterizer->ReleaseFences();
+        while (regs.fence_value >
+               syncpoint_manager.GetGuestSyncpointValue(regs.fence_action.syncpoint_id)) {
+            rasterizer->ReleaseFences();
+            gpu.Scheduler().Yield();
+        }
         break;
     case Puller::FenceOperation::Increment:
         rasterizer->SignalSyncPoint(regs.fence_action.syncpoint_id);
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 59356015b..f45f797b3 100755
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -387,6 +387,14 @@ std::shared_ptr<Control::ChannelState> GPU::AllocateChannel() {
     return impl->AllocateChannel();
 }
 
+Tegra::Control::Scheduler& GPU::Scheduler() {
+    return *impl->scheduler;
+}
+
+const Tegra::Control::Scheduler& GPU::Scheduler() const {
+    return *impl->scheduler;
+}
+
 void GPU::InitChannel(Control::ChannelState& to_init) {
     impl->InitChannel(to_init);
 }
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 25c75a109..154466f23 100755
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -124,7 +124,8 @@ class KeplerCompute;
 
 namespace Control {
 struct ChannelState;
-}
+class Scheduler;
+} // namespace Control
 
 namespace Host1x {
 class Host1x;
@@ -204,6 +205,12 @@ public:
     /// Returns a const reference to the shader notifier.
     [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const;
 
+    /// Returns GPU Channel Scheduler.
+    [[nodiscard]] Tegra::Control::Scheduler& Scheduler();
+
+    /// Returns GPU Channel Scheduler.
+    [[nodiscard]] const Tegra::Control::Scheduler& Scheduler() const;
+
     [[nodiscard]] u64 GetTicks() const;
 
     [[nodiscard]] bool IsAsync() const;
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 0832234af..3aa59d034 100755
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -34,13 +34,15 @@ static void RunThread(std::stop_token stop_token, Core::System& system,
 
     CommandDataContainer next;
 
+    scheduler.Init();
+
     while (!stop_token.stop_requested()) {
         state.queue.PopWait(next, stop_token);
         if (stop_token.stop_requested()) {
             break;
         }
-        if (auto* submit_list = std::get_if<SubmitListCommand>(&next.data)) {
-            scheduler.Push(submit_list->channel, std::move(submit_list->entries));
+        if (std::holds_alternative<SubmitListCommand>(next.data)) {
+            scheduler.Resume();
         } else if (std::holds_alternative<GPUTickCommand>(next.data)) {
             system.GPU().TickWork();
         } else if (const auto* flush = std::get_if<FlushRegionCommand>(&next.data)) {
@@ -67,14 +69,16 @@ ThreadManager::~ThreadManager() = default;
 
 void ThreadManager::StartThread(VideoCore::RendererBase& renderer,
                                 Core::Frontend::GraphicsContext& context,
-                                Tegra::Control::Scheduler& scheduler) {
+                                Tegra::Control::Scheduler& scheduler_) {
     rasterizer = renderer.ReadRasterizer();
+    scheduler = &scheduler_;
     thread = std::jthread(RunThread, std::ref(system), std::ref(renderer), std::ref(context),
-                          std::ref(scheduler), std::ref(state));
+                          std::ref(scheduler_), std::ref(state));
 }
 
 void ThreadManager::SubmitList(s32 channel, Tegra::CommandList&& entries) {
-    PushCommand(SubmitListCommand(channel, std::move(entries)));
+    scheduler->Push(channel, std::move(entries));
+    PushCommand(SubmitListCommand());
 }
 
 void ThreadManager::FlushRegion(DAddr addr, u64 size) {
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index 1a0de86b5..a4b01c42a 100755
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -36,13 +36,7 @@ class RendererBase;
 namespace VideoCommon::GPUThread {
 
 /// Command to signal to the GPU thread that a command list is ready for processing
-struct SubmitListCommand final {
-    explicit SubmitListCommand(s32 channel_, Tegra::CommandList&& entries_)
-        : channel{channel_}, entries{std::move(entries_)} {}
-
-    s32 channel;
-    Tegra::CommandList entries;
-};
+struct SubmitListCommand final {};
 
 /// Command to signal to the GPU thread to flush a region
 struct FlushRegionCommand final {
@@ -124,6 +118,7 @@ public:
 private:
     /// Pushes a command to be executed by the GPU thread
     u64 PushCommand(CommandData&& command_data, bool block = false);
+    Tegra::Control::Scheduler* scheduler;
 
     Core::System& system;
     const bool is_async;
diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp
index 4cfaf7ce6..22eb7bd00 100755
--- a/src/video_core/texture_cache/image_info.cpp
+++ b/src/video_core/texture_cache/image_info.cpp
@@ -42,7 +42,6 @@ ImageInfo::ImageInfo(const TICEntry& config) noexcept {
         };
     }
     rescaleable = false;
-    is_sparse = config.is_sparse != 0;
     tile_width_spacing = config.tile_width_spacing;
     if (config.texture_type != TextureType::Texture2D &&
         config.texture_type != TextureType::Texture2DNoMipmap) {
diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h
index 286457acc..0a50795b3 100755
--- a/src/video_core/texture_cache/image_info.h
+++ b/src/video_core/texture_cache/image_info.h
@@ -41,7 +41,6 @@ struct ImageInfo {
     bool downscaleable = false;
     bool forced_flushed = false;
     bool dma_downloaded = false;
-    bool is_sparse = false;
 };
 
 } // namespace VideoCommon
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 47ea0bd96..85046e708 100755
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -600,17 +600,17 @@ void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz
                             [&](ImageId id, Image&) { deleted_images.push_back(id); });
     for (const ImageId id : deleted_images) {
         Image& image = slot_images[id];
-        if (False(image.flags & ImageFlagBits::CpuModified)) {
-            image.flags |= ImageFlagBits::CpuModified;
-            if (True(image.flags & ImageFlagBits::Tracked)) {
-                UntrackImage(image, id);
-            }
+        if (True(image.flags & ImageFlagBits::CpuModified)) {
+            continue;
         }
-
+        image.flags |= ImageFlagBits::CpuModified;
         if (True(image.flags & ImageFlagBits::Remapped)) {
             continue;
         }
         image.flags |= ImageFlagBits::Remapped;
+        if (True(image.flags & ImageFlagBits::Tracked)) {
+            UntrackImage(image, id);
+        }
     }
 }
 
@@ -1469,8 +1469,7 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, DA
     const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr);
     Image& new_image = slot_images[new_image_id];
 
-    if (!gpu_memory->IsContinuousRange(new_image.gpu_addr, new_image.guest_size_bytes) &&
-        new_info.is_sparse) {
+    if (!gpu_memory->IsContinuousRange(new_image.gpu_addr, new_image.guest_size_bytes)) {
         new_image.flags |= ImageFlagBits::Sparse;
     }