pcsx2/pcsx2/GS/GSState.cpp
2025-08-16 03:22:27 +02:00

5271 lines
163 KiB
C++

// SPDX-FileCopyrightText: 2002-2025 PCSX2 Dev Team
// SPDX-License-Identifier: GPL-3.0+
#include "GS/GSState.h"
#include "GS/GSDump.h"
#include "GS/GSGL.h"
#include "GS/GSPerfMon.h"
#include "GS/GSUtil.h"
#include "common/Console.h"
#include "common/BitUtils.h"
#include "common/Path.h"
#include "common/StringUtil.h"
#include <algorithm>
#include <cfloat>
#include <fstream>
#include <iomanip>
#include <bit>
// Out-of-class definitions of GSState's static integer counters, all starting at zero.
// s_n and s_transfer_n are additionally reset to zero by the GSState constructor.
// NOTE(review): presumably s_n counts draws and s_transfer_n counts transfers for debug dumps — confirm against GSState.h.
int GSState::s_n = 0;
int GSState::s_last_transfer_draw_n = 0;
int GSState::s_transfer_n = 0;
// Tells whether auto-flush is active for the renderer currently in use.
// Software rendering reads the AutoFlushSW switch; hardware rendering treats any
// UserHacks_AutoFlush level other than Disabled as enabled.
static __fi bool IsAutoFlushEnabled()
{
	if (!GSIsHardwareRenderer())
		return GSConfig.AutoFlushSW;

	return GSConfig.UserHacks_AutoFlush != GSHWAutoFlushLevel::Disabled;
}
// Computes the number of bytes a GS save state of the given version occupies.
// The result is the sum of the sizes of every serialized field; fields marked
// "obsolete" are no longer meaningful but are still counted.
// Versions >= 9 additionally include the extended transfer (m_tr) fields.
constexpr int GSState::GetSaveStateSize(int version)
{
	int bytes = 0;

	// Version header and the global (non-context) environment registers.
	bytes += sizeof(STATE_VERSION);
	bytes += sizeof(m_env.PRIM) + sizeof(m_env.PRMODECONT) + sizeof(m_env.TEXCLUT) + sizeof(m_env.SCANMSK);
	bytes += sizeof(m_env.TEXA) + sizeof(m_env.FOGCOL) + sizeof(m_env.DIMX) + sizeof(m_env.DTHE);
	bytes += sizeof(m_env.COLCLAMP) + sizeof(m_env.PABE);
	bytes += sizeof(m_env.BITBLTBUF) + sizeof(m_env.TRXDIR) + sizeof(m_env.TRXPOS) + sizeof(m_env.TRXREG);
	bytes += sizeof(m_env.TRXREG); // obsolete

	// Both drawing contexts share one type, so size a single context and count it twice.
	const int per_context = static_cast<int>(
		sizeof(m_env.CTXT[0].XYOFFSET) + sizeof(m_env.CTXT[0].TEX0) + sizeof(m_env.CTXT[0].TEX1) +
		sizeof(m_env.CTXT[0].CLAMP) + sizeof(m_env.CTXT[0].MIPTBP1) + sizeof(m_env.CTXT[0].MIPTBP2) +
		sizeof(m_env.CTXT[0].SCISSOR) + sizeof(m_env.CTXT[0].ALPHA) + sizeof(m_env.CTXT[0].TEST) +
		sizeof(m_env.CTXT[0].FBA) + sizeof(m_env.CTXT[0].FRAME) + sizeof(m_env.CTXT[0].ZBUF));
	bytes += 2 * per_context;

	// Current vertex registers.
	bytes += sizeof(m_v.RGBAQ) + sizeof(m_v.ST) + sizeof(m_v.UV) + sizeof(m_v.FOG) + sizeof(m_v.XYZ);
	bytes += sizeof(GIFReg); // obsolete

	// Transfer state: x/y are always present, the remaining fields only from version 9 onwards.
	bytes += sizeof(m_tr.x) + sizeof(m_tr.y);
	if (version >= 9)
	{
		bytes += sizeof(m_tr.w) + sizeof(m_tr.h);
		bytes += sizeof(m_tr.m_blit) + sizeof(m_tr.m_pos) + sizeof(m_tr.m_reg);
		bytes += sizeof(m_tr.rect) + sizeof(m_tr.total);
		bytes += sizeof(m_tr.start) + sizeof(m_tr.end) + sizeof(m_tr.write);
	}

	// Local memory image, the GIF paths, and the latched Q value.
	bytes += GSLocalMemory::m_vmsize;
	bytes += (sizeof(GIFPath::tag) + sizeof(GIFPath::reg)) * 4 /* std::size(GSState::m_path) */; // std::size won't work without an instance.
	bytes += sizeof(m_q);

	return bytes;
}
// Constructs the GS state: caches renderer settings, zeroes the vertex/index bookkeeping
// and local memory, then resets the register environment and installs the register handlers.
GSState::GSState()
: m_vt(this)
{
// m_nativeres seems to be a hack. Unfortunately it impacts draw call number which make debug painful in the replayer.
// Let's keep it disabled to ease debug.
// NOTE(review): the assignment below enables it whenever the upscale multiplier is exactly 1.0,
// so the "disabled" wording above looks stale — confirm.
m_nativeres = GSConfig.UpscaleMultiplier == 1.0f;
m_mipmap = GSConfig.Mipmap;
// The static counters are restarted for every new GSState instance.
s_n = 0;
s_transfer_n = 0;
// Zero the current vertex, the vertex/index buffer descriptors and the whole local memory image.
memset(&m_v, 0, sizeof(m_v));
memset(&m_vertex, 0, sizeof(m_vertex));
memset(&m_index, 0, sizeof(m_index));
memset(m_mem.m_vm8, 0, m_mem.m_vmsize);
m_v.RGBAQ.Q = 1.0f;
// Called only after m_vertex/m_index were zeroed above.
GrowVertexBuffer();
PRIM = &m_env.PRIM;
//CSR->rREV = 0x20;
m_env.PRMODECONT.AC = 1;
// Non-hardware reset (hardware_reset == false), followed by populating the handler tables.
Reset(false);
ResetHandlers();
}
// Releases the aligned vertex/index buffers owned by this object.
GSState::~GSState()
{
	// All buffers are freed the same way; null pointers are skipped.
	const auto release = [](void* buffer) {
		if (buffer)
			_aligned_free(buffer);
	};

	release(m_vertex.buff);
	release(m_vertex.buff_copy);
	release(m_index.buff);
	release(m_draw_vertex.buff);
	release(m_draw_index.buff);
}
// Builds the full path of a draw dump file.
// The printf-style `format` and its variadic arguments produce the file name, which is
// appended to the hardware or software dump directory depending on the active renderer.
std::string GSState::GetDrawDumpPath(const char* format, ...)
{
	std::va_list args;
	va_start(args, format);
	const std::string file_name = StringUtil::StdStringFromFormatV(format, args);
	va_end(args);

	const std::string& directory = GSIsHardwareRenderer() ? GSConfig.HWDumpDirectory : GSConfig.SWDumpDirectory;
	return Path::Combine(directory, file_name);
}
// Resets the GS drawing state: flushes pending work, clears the GIF paths and current vertex,
// resets the register environment and CLUT, recomputes per-context scissor/offsets, empties the
// vertex/index queues and finally snapshots the environment into m_prev_env.
// hardware_reset: when false and a hardware renderer is active, the scissor cull rectangle of
// both contexts is set to an out-of-bounds value (see the long note inside the loop).
void GSState::Reset(bool hardware_reset)
{
// Anything still queued must be drawn with the old state before it is wiped.
Flush(GSFlushReason::RESET);
// FIXME: bios logo not shown cut in half after reset, missing graphics in GoW after first FMV
memset(&m_path, 0, sizeof(m_path));
memset(&m_v, 0, sizeof(m_v));
m_env.Reset();
m_mem.m_clut.Reset();
PRIM = &m_env.PRIM;
UpdateContext();
UpdateVertexKick();
// Recompute the derived per-context state for both drawing contexts.
for (u32 i = 0; i < 2; i++)
{
m_env.CTXT[i].UpdateScissor();
// What is this nonsense? Basically, GOW does a 32x448 draw after resetting the GS, thinking the PSM for the framebuffer is going
// to be set to C24, therefore the alpha bits get left alone. Because of the reset, in PCSX2, it ends up as C32, and the TC gets
// confused, leading to a later texture load using this render target instead of local memory. It's a problem because the game
// uploads texture data on startup to the beginning of VRAM, and never overwrites it.
//
// In the software renderer, if we let the draw happen, it gets scissored to 1x1 (because the scissor is inclusive of the
// upper bounds). This doesn't seem to destroy the chest texture, presumably it's further out in memory.
//
// Hardware test show that VRAM gets corrupted on CSR reset, but the first page remains intact. We're guessing this has something
// to do with DRAM refresh, and perhaps the internal counters used for refresh also getting reset. We're obviously not going
// to emulate this, but to work around the aforementioned issue, in the hardware renderers, we set the scissor to an out of
// bounds value. This means that draws get skipped until the game sets a proper scissor up, which is definitely going to happen
// after reset (otherwise it'd only ever render 1x1).
//
if (!hardware_reset && GSIsHardwareRenderer())
m_env.CTXT[i].scissor.cull = GSVector4i::xffffffff();
// Note that the Z buffer offset is built with FRAME.FBW, i.e. it uses the frame buffer's width.
m_env.CTXT[i].offset.fb = m_mem.GetOffset(m_env.CTXT[i].FRAME.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].FRAME.PSM);
m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].ZBUF.PSM);
m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF);
}
UpdateScissor();
// Empty the vertex and index queues.
m_vertex.head = 0;
m_vertex.tail = 0;
m_vertex.next = 0;
m_index.tail = 0;
// Clear the remaining tracking flags.
m_scanmask_used = 0;
m_texflush_flag = false;
m_channel_shuffle = false;
m_dirty_gs_regs = 0;
m_backed_up_ctx = -1;
// With nothing dirty, the "previous" environment is made identical to the current one.
memcpy(&m_prev_env, &m_env, sizeof(m_prev_env));
}
// Fills the XYZ/XYZF handler tables for every primitive type.
// For a primitive P the tables are indexed as: [P][0] XYZF2 adc=0, [P][1] XYZF2 adc=1,
// [P][2] XYZ2 adc=0, [P][3] XYZ2 adc=1, for both the packed and the non-packed register paths,
// plus one STQ+RGBA+XYZ(F)2 batch handler per primitive.
// auto_flush selects which template instantiation is installed for every primitive except
// GS_POINTLIST, which is always instantiated with auto_flush = true.
template<bool auto_flush>
void GSState::SetPrimHandlers()
{
#define SetHandlerXYZ(P, auto_flush) \
m_fpGIFPackedRegHandlerXYZ[P][0] = &GSState::GIFPackedRegHandlerXYZF2<P, 0, auto_flush>; \
m_fpGIFPackedRegHandlerXYZ[P][1] = &GSState::GIFPackedRegHandlerXYZF2<P, 1, auto_flush>; \
m_fpGIFPackedRegHandlerXYZ[P][2] = &GSState::GIFPackedRegHandlerXYZ2<P, 0, auto_flush>; \
m_fpGIFPackedRegHandlerXYZ[P][3] = &GSState::GIFPackedRegHandlerXYZ2<P, 1, auto_flush>; \
m_fpGIFRegHandlerXYZ[P][0] = &GSState::GIFRegHandlerXYZF2<P, 0, auto_flush>; \
m_fpGIFRegHandlerXYZ[P][1] = &GSState::GIFRegHandlerXYZF2<P, 1, auto_flush>; \
m_fpGIFRegHandlerXYZ[P][2] = &GSState::GIFRegHandlerXYZ2<P, 0, auto_flush>; \
m_fpGIFRegHandlerXYZ[P][3] = &GSState::GIFRegHandlerXYZ2<P, 1, auto_flush>; \
m_fpGIFPackedRegHandlerSTQRGBAXYZF2[P] = &GSState::GIFPackedRegHandlerSTQRGBAXYZF2<P, auto_flush>; \
m_fpGIFPackedRegHandlerSTQRGBAXYZ2[P] = &GSState::GIFPackedRegHandlerSTQRGBAXYZ2<P, auto_flush>;
// Point lists ignore the template argument and always get the auto-flush variants.
SetHandlerXYZ(GS_POINTLIST, true);
SetHandlerXYZ(GS_LINELIST, auto_flush);
SetHandlerXYZ(GS_LINESTRIP, auto_flush);
SetHandlerXYZ(GS_TRIANGLELIST, auto_flush);
SetHandlerXYZ(GS_TRIANGLESTRIP, auto_flush);
SetHandlerXYZ(GS_TRIANGLEFAN, auto_flush);
SetHandlerXYZ(GS_SPRITE, auto_flush);
SetHandlerXYZ(GS_INVALID, auto_flush);
#undef SetHandlerXYZ
}
// (Re)builds the register dispatch tables: the packed-register table, the per-primitive
// XYZ tables (via SetPrimHandlers) and the A+D register table.
// The UV handlers and the auto-flush variants depend on the current GSConfig, which is why
// UpdateSettings() calls this again when those options change.
void GSState::ResetHandlers()
{
// Packed registers: default every slot to the null handler, then override the known ones.
std::fill(std::begin(m_fpGIFPackedRegHandlers), std::end(m_fpGIFPackedRegHandlers), &GSState::GIFPackedRegHandlerNull);
// PRIM, TEX0 and CLAMP reuse the non-packed handlers through a member-function-pointer cast.
m_fpGIFPackedRegHandlers[GIF_REG_PRIM] = (GIFPackedRegHandler)(GIFRegHandler)&GSState::GIFRegHandlerPRIM;
m_fpGIFPackedRegHandlers[GIF_REG_RGBA] = &GSState::GIFPackedRegHandlerRGBA;
m_fpGIFPackedRegHandlers[GIF_REG_STQ] = &GSState::GIFPackedRegHandlerSTQ;
m_fpGIFPackedRegHandlers[GIF_REG_UV] = GSConfig.UserHacks_ForceEvenSpritePosition ? &GSState::GIFPackedRegHandlerUV_Hack : &GSState::GIFPackedRegHandlerUV;
m_fpGIFPackedRegHandlers[GIF_REG_TEX0_1] = (GIFPackedRegHandler)(GIFRegHandler)&GSState::GIFRegHandlerTEX0<0>;
m_fpGIFPackedRegHandlers[GIF_REG_TEX0_2] = (GIFPackedRegHandler)(GIFRegHandler)&GSState::GIFRegHandlerTEX0<1>;
m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1] = (GIFPackedRegHandler)(GIFRegHandler)&GSState::GIFRegHandlerCLAMP<0>;
m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2] = (GIFPackedRegHandler)(GIFRegHandler)&GSState::GIFRegHandlerCLAMP<1>;
m_fpGIFPackedRegHandlers[GIF_REG_FOG] = &GSState::GIFPackedRegHandlerFOG;
m_fpGIFPackedRegHandlers[GIF_REG_A_D] = &GSState::GIFPackedRegHandlerA_D;
m_fpGIFPackedRegHandlers[GIF_REG_NOP] = &GSState::GIFPackedRegHandlerNOP;
// XYZ/XYZF handlers are template instantiations chosen by the auto-flush setting.
if (IsAutoFlushEnabled())
SetPrimHandlers<true>();
else
SetPrimHandlers<false>();
// A+D registers: same approach, null by default and then one entry per known register.
std::fill(std::begin(m_fpGIFRegHandlers), std::end(m_fpGIFRegHandlers), &GSState::GIFRegHandlerNull);
m_fpGIFRegHandlers[GIF_A_D_REG_PRIM] = &GSState::GIFRegHandlerPRIM;
m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ] = &GSState::GIFRegHandlerRGBAQ;
// The RGBAQ handler is also installed at the RGBAQ address + 0x10.
m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ + 0x10] = &GSState::GIFRegHandlerRGBAQ;
m_fpGIFRegHandlers[GIF_A_D_REG_ST] = &GSState::GIFRegHandlerST;
m_fpGIFRegHandlers[GIF_A_D_REG_UV] = GSConfig.UserHacks_ForceEvenSpritePosition ? &GSState::GIFRegHandlerUV_Hack : &GSState::GIFRegHandlerUV;
m_fpGIFRegHandlers[GIF_A_D_REG_TEX0_1] = &GSState::GIFRegHandlerTEX0<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_TEX0_2] = &GSState::GIFRegHandlerTEX0<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_CLAMP_1] = &GSState::GIFRegHandlerCLAMP<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_CLAMP_2] = &GSState::GIFRegHandlerCLAMP<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_FOG] = &GSState::GIFRegHandlerFOG;
m_fpGIFRegHandlers[GIF_A_D_REG_NOP] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_TEX1_1] = &GSState::GIFRegHandlerTEX1<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_TEX1_2] = &GSState::GIFRegHandlerTEX1<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_TEX2_1] = &GSState::GIFRegHandlerTEX2<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_TEX2_2] = &GSState::GIFRegHandlerTEX2<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_XYOFFSET_1] = &GSState::GIFRegHandlerXYOFFSET<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_XYOFFSET_2] = &GSState::GIFRegHandlerXYOFFSET<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT] = &GSState::GIFRegHandlerPRMODECONT;
m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE] = &GSState::GIFRegHandlerPRMODE;
m_fpGIFRegHandlers[GIF_A_D_REG_TEXCLUT] = &GSState::GIFRegHandlerTEXCLUT;
m_fpGIFRegHandlers[GIF_A_D_REG_SCANMSK] = &GSState::GIFRegHandlerSCANMSK;
m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP1_1] = &GSState::GIFRegHandlerMIPTBP1<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP1_2] = &GSState::GIFRegHandlerMIPTBP1<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP2_1] = &GSState::GIFRegHandlerMIPTBP2<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP2_2] = &GSState::GIFRegHandlerMIPTBP2<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_TEXA] = &GSState::GIFRegHandlerTEXA;
m_fpGIFRegHandlers[GIF_A_D_REG_FOGCOL] = &GSState::GIFRegHandlerFOGCOL;
m_fpGIFRegHandlers[GIF_A_D_REG_TEXFLUSH] = &GSState::GIFRegHandlerTEXFLUSH;
m_fpGIFRegHandlers[GIF_A_D_REG_SCISSOR_1] = &GSState::GIFRegHandlerSCISSOR<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_SCISSOR_2] = &GSState::GIFRegHandlerSCISSOR<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_ALPHA_1] = &GSState::GIFRegHandlerALPHA<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_ALPHA_2] = &GSState::GIFRegHandlerALPHA<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_DIMX] = &GSState::GIFRegHandlerDIMX;
m_fpGIFRegHandlers[GIF_A_D_REG_DTHE] = &GSState::GIFRegHandlerDTHE;
m_fpGIFRegHandlers[GIF_A_D_REG_COLCLAMP] = &GSState::GIFRegHandlerCOLCLAMP;
m_fpGIFRegHandlers[GIF_A_D_REG_TEST_1] = &GSState::GIFRegHandlerTEST<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_TEST_2] = &GSState::GIFRegHandlerTEST<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_PABE] = &GSState::GIFRegHandlerPABE;
m_fpGIFRegHandlers[GIF_A_D_REG_FBA_1] = &GSState::GIFRegHandlerFBA<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_FBA_2] = &GSState::GIFRegHandlerFBA<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_FRAME_1] = &GSState::GIFRegHandlerFRAME<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_FRAME_2] = &GSState::GIFRegHandlerFRAME<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_ZBUF_1] = &GSState::GIFRegHandlerZBUF<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_ZBUF_2] = &GSState::GIFRegHandlerZBUF<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_BITBLTBUF] = &GSState::GIFRegHandlerBITBLTBUF;
m_fpGIFRegHandlers[GIF_A_D_REG_TRXPOS] = &GSState::GIFRegHandlerTRXPOS;
m_fpGIFRegHandlers[GIF_A_D_REG_TRXREG] = &GSState::GIFRegHandlerTRXREG;
m_fpGIFRegHandlers[GIF_A_D_REG_TRXDIR] = &GSState::GIFRegHandlerTRXDIR;
m_fpGIFRegHandlers[GIF_A_D_REG_HWREG] = &GSState::GIFRegHandlerHWREG;
// SIGNAL, FINISH and LABEL are explicitly mapped to the null handler in this table.
m_fpGIFRegHandlers[GIF_A_D_REG_SIGNAL] = &GSState::GIFRegHandlerNull;
m_fpGIFRegHandlers[GIF_A_D_REG_FINISH] = &GSState::GIFRegHandlerNull;
m_fpGIFRegHandlers[GIF_A_D_REG_LABEL] = &GSState::GIFRegHandlerNull;
}
// Re-seeds the PCRTC display helper from the current privileged registers:
// video mode, which displays are enabled, and the DISPLAY/DISPFB rectangles of both circuits.
void GSState::ResetPCRTC()
{
	PCRTCDisplays.SetVideoMode(GetVideoMode());
	PCRTCDisplays.EnableDisplays(m_regs->PMODE, m_regs->SMODE2, isReallyInterlaced());

	for (int display = 0; display < 2; display++)
	{
		auto& circuit = m_regs->DISP[display];
		PCRTCDisplays.SetRects(display, circuit.DISPLAY, circuit.DISPFB);
	}
}
// Applies a configuration change. The mipmap setting is always re-read; the handler tables are
// rebuilt only when an option that affects handler selection differs from old_config.
void GSState::UpdateSettings(const Pcsx2Config::GSOptions& old_config)
{
	m_mipmap = GSConfig.Mipmap;

	const bool auto_flush_changed =
		GSConfig.AutoFlushSW != old_config.AutoFlushSW ||
		GSConfig.UserHacks_AutoFlush != old_config.UserHacks_AutoFlush;
	const bool even_sprite_changed =
		GSConfig.UserHacks_ForceEvenSpritePosition != old_config.UserHacks_ForceEvenSpritePosition;

	if (auto_flush_changed || even_sprite_changed)
		ResetHandlers();
}
// True when the INT field of the SMODE2 register is non-zero.
bool GSState::isinterlaced()
{
	return m_regs->SMODE2.INT != 0;
}
// The FIELD register only flips if the CMOD field in SMODE1 is set to anything but 0
// and the bottom bit of the vertical front porch (SYNCV.VFP) is set.
bool GSState::isReallyInterlaced()
{
	const bool front_porch_odd = (m_regs->SYNCV.VFP & 0x1) != 0;
	if (!front_porch_odd)
		return false;

	return m_regs->SMODE1.CMOD != 0;
}
// Guesses the active video mode from the CRTC registers (SMODE1.CMOD, SMODE1.LC and the
// interlace flag). Only a limited set of modes can be told apart this way.
GSVideoMode GSState::GetVideoMode()
{
	// TODO: Get confirmation of videomode from SYSCALL ? not necessary but would be nice.
	// Other videomodes can't be detected on our side without the help of the data from core.
	const u8 colorburst = m_regs->SMODE1.CMOD; // Subcarrier frequency
	const u8 pll_divider = m_regs->SMODE1.LC; // Phased lock loop divider

	// A non-zero colorburst identifies the analogue TV standards directly.
	if (colorburst == 2)
		return GSVideoMode::NTSC;
	if (colorburst == 3)
		return GSVideoMode::PAL;
	if (colorburst != 0)
		return GSVideoMode::Unknown;

	// Colorburst 0: distinguish by PLL divider and interlacing.
	const bool interlaced = isinterlaced();
	if (pll_divider == 22)
		return interlaced ? GSVideoMode::HDTV_1080I : GSVideoMode::HDTV_720P;

	// TODO: 576P will also be reported as 480P, find some way to differentiate.
	if (pll_divider == 32 && !interlaced)
		return GSVideoMode::SDTV_480P;

	return GSVideoMode::VESA;
}
float GSState::GetTvRefreshRate()
{
const GSVideoMode videomode = GetVideoMode();
//TODO: Check vertical frequencies for VESA video modes, old ones were untested.
switch (videomode)
{
case GSVideoMode::NTSC:
case GSVideoMode::SDTV_480P:
return (60 / 1.001f);
case GSVideoMode::PAL:
return 50;
case GSVideoMode::HDTV_720P:
case GSVideoMode::HDTV_1080I:
return 60;
default:
Console.Error("GS: Unknown video mode. Please report: https://github.com/PCSX2/pcsx2/issues");
return 0;
}
ASSUME(0); // unreachable
}
// Maps a flush reason to the label used in dumps/logs.
// UNKNOWN and any value not listed below both yield "UNKNOWN".
const char* GSState::GetFlushReasonString(GSFlushReason reason)
{
	const char* label = "UNKNOWN";

	switch (reason)
	{
		case GSFlushReason::RESET:
			label = "RESET";
			break;
		case GSFlushReason::CONTEXTCHANGE:
			label = "CONTEXT CHANGE";
			break;
		case GSFlushReason::CLUTCHANGE:
			label = "CLUT CHANGE (RELOAD REQ)";
			break;
		case GSFlushReason::GSTRANSFER:
			label = "GS TRANSFER";
			break;
		case GSFlushReason::UPLOADDIRTYTEX:
			label = "GS UPLOAD OVERWRITES CURRENT TEXTURE OR CLUT";
			break;
		case GSFlushReason::UPLOADDIRTYFRAME:
			label = "GS UPLOAD OVERWRITES CURRENT FRAME BUFFER";
			break;
		case GSFlushReason::UPLOADDIRTYZBUF:
			label = "GS UPLOAD OVERWRITES CURRENT ZBUFFER";
			break;
		case GSFlushReason::LOCALTOLOCALMOVE:
			label = "GS LOCAL TO LOCAL OVERWRITES CURRENT TEXTURE OR CLUT";
			break;
		case GSFlushReason::DOWNLOADFIFO:
			label = "DOWNLOAD FIFO";
			break;
		case GSFlushReason::SAVESTATE:
			label = "SAVESTATE";
			break;
		case GSFlushReason::LOADSTATE:
			label = "LOAD SAVESTATE";
			break;
		case GSFlushReason::AUTOFLUSH:
			label = "AUTOFLUSH OVERLAP DETECTED";
			break;
		case GSFlushReason::VSYNC:
			label = "VSYNC";
			break;
		case GSFlushReason::GSREOPEN:
			label = "GS REOPEN";
			break;
		case GSFlushReason::UNKNOWN:
		default:
			break;
	}

	return label;
}
// Writes a human-readable dump of the queued draw to `filename`: the flush reason, then for
// every queued index the vertex position, colour/fog and texture coordinates, and finally the
// vertex-trace min/max/equality summary. Does nothing if the file cannot be opened.
//
// Lines end with '\n' rather than std::endl so the stream is not flushed once per line
// (several times per vertex); the data is flushed once when the file is closed at the end.
void GSState::DumpVertices(const std::string& filename)
{
	std::ofstream file(filename);
	if (!file.is_open())
		return;

	file << "FLUSH REASON: " << GetFlushReasonString(m_state_flush_reason);
	if (m_state_flush_reason != GSFlushReason::CONTEXTCHANGE && m_dirty_gs_regs)
		file << " AND POSSIBLE CONTEXT CHANGE";
	file << "\n\n";

	const u32 count = m_index.tail;
	const GSVertex* buffer = &m_vertex.buff[0];
	const char* DEL = ", ";

	// Positions: X/Y have the context's XY offset removed and are divided by 16.
	file << "VERTEX COORDS (XYZ)" << '\n';
	file << std::fixed << std::setprecision(4);
	for (u32 i = 0; i < count; ++i)
	{
		file << "\t" << std::dec << "v" << i << ": ";
		const GSVertex& v = buffer[m_index.buff[i]];
		const float x = (v.XYZ.X - (int)m_context->XYOFFSET.OFX) / 16.0f;
		const float y = (v.XYZ.Y - (int)m_context->XYOFFSET.OFY) / 16.0f;
		file << x << DEL;
		file << y << DEL;
		file << v.XYZ.Z;
		file << '\n';
	}
	file << '\n';

	// Colours: each channel zero-padded to three digits (setw applies to the next field only).
	file << "VERTEX COLOR (RGBA)" << '\n';
	file << std::fixed << std::setprecision(6);
	for (u32 i = 0; i < count; ++i)
	{
		file << "\t" << std::dec << "v" << i << ": ";
		const GSVertex& v = buffer[m_index.buff[i]];
		file << std::setfill('0') << std::setw(3) << unsigned(v.RGBAQ.R) << DEL;
		file << std::setfill('0') << std::setw(3) << unsigned(v.RGBAQ.G) << DEL;
		file << std::setfill('0') << std::setw(3) << unsigned(v.RGBAQ.B) << DEL;
		file << std::setfill('0') << std::setw(3) << unsigned(v.RGBAQ.A) << DEL;
		file << "FOG: " << std::setfill('0') << std::setw(3) << unsigned(v.FOG);
		file << '\n';
	}
	file << '\n';

	// Texture coordinates: UV when PRIM->FST is set, otherwise S/T/Q.
	const bool use_uv = PRIM->FST;
	const std::string qualifier = use_uv ? "UV" : "STQ";
	file << "TEXTURE COORDS (" << qualifier << ")" << '\n';
	for (u32 i = 0; i < count; ++i)
	{
		file << "\t" << "v" << std::dec << i << ": ";
		const GSVertex& v = buffer[m_index.buff[i]];
		// note
		// Yes, technically as far as the GS is concerned Q belongs
		// to RGBAQ. However, the purpose of this dump is to print
		// our data in a more human readable format and typically Q
		// is associated with STQ.
		if (use_uv)
		{
			const float uv_U = v.U / 16.0f;
			const float uv_V = v.V / 16.0f;
			file << uv_U << DEL << uv_V;
		}
		else
		{
			// Each float is followed by its raw bit pattern in hex, then the S/Q and T/Q
			// values scaled by the texture size (1 << TW, 1 << TH).
			const float x = (v.ST.S / v.RGBAQ.Q) * (1 << m_context->TEX0.TW);
			const float y = (v.ST.T / v.RGBAQ.Q) * (1 << m_context->TEX0.TH);
			file << v.ST.S << "(" << std::hex << std::bit_cast<u32>(v.ST.S) << ")" << DEL << v.ST.T << "(" << std::hex << std::bit_cast<u32>(v.ST.T) << ")" << DEL << v.RGBAQ.Q << "(" << std::hex << std::bit_cast<u32>(v.RGBAQ.Q) << ") - " << x << "," << y;
		}
		file << '\n';
	}
	file << '\n';

	// Vertex trace summary; std::dec undoes any std::hex left over from the STQ branch.
	file << "TRACER" << std::dec << '\n';
	GSVector4i v = m_vt.m_min.c;
	file << "\tmin c (r,g,b,a): " << v.x << DEL << v.y << DEL << v.z << DEL << v.w << '\n';
	v = m_vt.m_max.c;
	file << "\tmax c (r,g,b,a): " << v.x << DEL << v.y << DEL << v.z << DEL << v.w << '\n';
	GSVector4 v2 = m_vt.m_min.p;
	file << "\tmin p (x,y,z,f): " << v2.x << DEL << v2.y << DEL << v2.z << DEL << (u32)v2.w << '\n';
	v2 = m_vt.m_max.p;
	file << "\tmax p (x,y,z,f): " << v2.x << DEL << v2.y << DEL << v2.z << DEL << (u32)v2.w << '\n';
	v2 = m_vt.m_min.t;
	file << "\tmin t (u,v,q): " << v2.x << DEL << v2.y << DEL << v2.z << '\n';
	v2 = m_vt.m_max.t;
	file << "\tmax t (u,v,q): " << v2.x << DEL << v2.y << DEL << v2.z << '\n';
	file << '\n';
	file << "\teq c (r,g,b,a): " << (m_vt.m_eq.r & 1) << DEL << (m_vt.m_eq.g & 1) << DEL << (m_vt.m_eq.b & 1) << DEL << (m_vt.m_eq.a & 1) << '\n';
	file << "\teq p (x,y,z,f): " << (m_vt.m_eq.x & 1) << DEL << (m_vt.m_eq.y & 1) << DEL << (m_vt.m_eq.z & 1) << DEL << (m_vt.m_eq.f & 1) << '\n';
	file << "\teq t (u,v,q) : " << (m_vt.m_eq.s & 1) << DEL << (m_vt.m_eq.t & 1) << DEL << (m_vt.m_eq.q & 1) << '\n';

	// Closing flushes everything written above.
	file.close();
}
// Flushes the queued draw with reason CONTEXTCHANGE when registers are marked dirty,
// something is queued, and TestDrawChanged() reports a change.
__inline void GSState::CheckFlushes()
{
	// Nothing to do unless some register is dirty and at least one index is queued.
	if (!m_dirty_gs_regs || !(m_index.tail > 0))
		return;

	if (TestDrawChanged())
		Flush(GSFlushReason::CONTEXTCHANGE);
}
// Default packed-register handler: intentionally does nothing with `r`.
// ResetHandlers() fills the packed table with this before installing the real handlers.
void GSState::GIFPackedRegHandlerNull(const GIFPackedReg* RESTRICT r)
{
}
// Packed RGBA: writes the colour into the low 32 bits of m_v.RGBAQ and sets Q from m_q.
void GSState::GIFPackedRegHandlerRGBA(const GIFPackedReg* RESTRICT r)
{
// The byte-shuffle mask 0x0c080400 names source bytes 0, 4, 8 and 12.
// NOTE(review): presumably this gathers the low byte of each 32-bit lane (R, G, B, A) into one dword — confirm against GSVector4i::shuffle8.
const GSVector4i mask = GSVector4i::load(0x0c080400);
const GSVector4i v = GSVector4i::load<false>(r).shuffle8(mask);
m_v.RGBAQ.U32[0] = (u32)GSVector4i::store(v);
// Q comes from the value latched in m_q (written by the packed STQ handlers).
m_v.RGBAQ.Q = m_q;
}
// Packed STQ: stores S/T into m_v.ST and latches a sanitised Q into m_q
// (zero is replaced by FLT_MIN and NaN by GSVector4::m_max).
void GSState::GIFPackedRegHandlerSTQ(const GIFPackedReg* RESTRICT r)
{
// The low 64 bits hold S and T, the high 64 bits hold Q.
const GSVector4i st = GSVector4i::loadl(&r->U64[0]);
GSVector4i q = GSVector4i::loadl(&r->U64[1]);
GSVector4i::storel(&m_v.ST, st);
// Vexx (character shadow)
// q = 0 (st also 0 on the first 16 vertices), setting it to FLT_MIN to avoid div by zero later
q = q.blend8(GSVector4i::cast(GSVector4(FLT_MIN)), q == GSVector4i::zero());
// Suikoden 4
// creates some nan for Q. Let's avoid undefined behavior (See GIFRegHandlerRGBAQ)
q = GSVector4i::cast(GSVector4::cast(q).replace_nan(GSVector4::m_max));
GSVector4::store(&m_q, GSVector4::cast(q));
// hide behind a define for now to avoid spam in the above cases for users
#if defined(PCSX2_DEVBUILD) || defined(_DEBUG)
if (std::isnan(m_v.ST.S) || std::isnan(m_v.ST.T))
Console.Warning("S or T is nan");
#endif
}
// Packed UV: masks each loaded 32-bit lane to its low 14 bits and stores the result,
// narrowed into a single 32-bit word, in m_v.UV.
// NOTE(review): ps32 presumably packs the two 32-bit lanes into 16-bit halves (U low, V high) — confirm against GSVector4i.
void GSState::GIFPackedRegHandlerUV(const GIFPackedReg* RESTRICT r)
{
const GSVector4i v = GSVector4i::loadl(r) & GSVector4i::x00003fff();
m_v.UV = (u32)GSVector4i::store(v.ps32(v));
}
// Variant of GIFPackedRegHandlerUV installed when UserHacks_ForceEvenSpritePosition is on.
// Stores UV the same way and additionally sets m_isPackedUV_HackFlag to true
// (the non-packed GIFRegHandlerUV_Hack sets it to false).
void GSState::GIFPackedRegHandlerUV_Hack(const GIFPackedReg* RESTRICT r)
{
const GSVector4i v = GSVector4i::loadl(r) & GSVector4i::x00003fff();
m_v.UV = (u32)GSVector4i::store(v.ps32(v));
m_isPackedUV_HackFlag = true;
}
// Packed XYZF2: builds the second half of the current vertex (m_v.m[1]) from the packed
// X/Y/Z/F data plus the current UV, then kicks the vertex.
// prim: primitive type, adc: non-zero forces the kick to be a "skip" (no drawing),
// auto_flush: forwarded to VertexKick.
template <u32 prim, u32 adc, bool auto_flush>
void GSState::GIFPackedRegHandlerXYZF2(const GIFPackedReg* RESTRICT r)
{
const bool skip = adc || r->XYZF2.Skip();
// Pending-state flush check: skipped only for a skip kick when neither the primitive class
// changed nor XYOFFSET is dirty.
if (!skip || GSUtil::GetPrimClass(m_prev_env.PRIM.PRIM) != GSUtil::GetPrimClass(m_env.PRIM.PRIM) || (m_dirty_gs_regs & (1 << DIRTY_REG_XYOFFSET)))
CheckFlushes();
GSVector4i xy = GSVector4i::loadnt(r);
GSVector4i zf = xy.zwzw();
// Interleave X/Y with the current UV; Z and F are shifted right by 4 and masked to 24 and 8 bits.
xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::load((int)m_v.UV));
zf = zf.srl32<4>() & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());
m_v.m[1] = xy.upl32(zf);
VertexKick<prim, auto_flush>(skip);
}
// Packed XYZ2: builds the second half of the current vertex (m_v.m[1]) from the packed
// X/Y/Z data plus the current UV (no fog in this format), then kicks the vertex.
// prim: primitive type, adc: non-zero forces the kick to be a "skip" (no drawing),
// auto_flush: forwarded to VertexKick.
template <u32 prim, u32 adc, bool auto_flush>
void GSState::GIFPackedRegHandlerXYZ2(const GIFPackedReg* RESTRICT r)
{
const bool skip = adc || r->XYZ2.Skip();
// Same flush gating as the XYZF2 handler.
if (!skip || GSUtil::GetPrimClass(m_prev_env.PRIM.PRIM) != GSUtil::GetPrimClass(m_env.PRIM.PRIM) || (m_dirty_gs_regs & (1 << DIRTY_REG_XYOFFSET)))
CheckFlushes();
const GSVector4i xy = GSVector4i::loadnt(r);
const GSVector4i z = xy.zzzz();
const GSVector4i xyz = xy.upl16(xy.srl<4>()).upl32(z);
// The upper 64 bits are loaded starting at m_v.UV.
m_v.m[1] = xyz.upl64(GSVector4i::loadl(&m_v.UV));
VertexKick<prim, auto_flush>(skip);
}
// Packed FOG: copies the F field of the packed register into the current vertex's fog value.
void GSState::GIFPackedRegHandlerFOG(const GIFPackedReg* RESTRICT r)
{
m_v.FOG = r->FOG.F;
}
// Packed A+D: looks up the handler for the register address carried in the packet
// (low 7 bits of ADDR) and invokes it on the embedded register data.
void GSState::GIFPackedRegHandlerA_D(const GIFPackedReg* RESTRICT r)
{
	const auto reg_index = r->A_D.ADDR & 0x7F;
	const auto handler = m_fpGIFRegHandlers[reg_index];
	(this->*handler)(&r->r);
}
// Packed NOP register handler: intentionally does nothing with `r`.
void GSState::GIFPackedRegHandlerNOP(const GIFPackedReg* RESTRICT r)
{
}
// Batch handler for packed register streams laid out as repeating STQ, RGBA, XYZF2 triples.
// r: first register of the stream; size: number of registers, must be a non-zero multiple of 3
// (asserted). Each triple produces one vertex kick.
template <u32 prim, bool auto_flush>
void GSState::GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, u32 size)
{
pxAssert(size > 0 && size % 3 == 0);
// One flush check for the whole batch.
CheckFlushes();
const GIFPackedReg* RESTRICT r_end = r + size;
while (r < r_end)
{
// r[0] = STQ, r[1] = RGBA, r[2] = XYZF2.
const GSVector4i st = GSVector4i::loadl(&r[0].U64[0]);
GSVector4i q = GSVector4i::loadl(&r[0].U64[1]);
const GSVector4i rgba = (GSVector4i::load<false>(&r[1]) & GSVector4i::x000000ff()).ps32().pu16();
q = q.blend8(GSVector4i::cast(GSVector4(FLT_MIN)), q == GSVector4i::zero()); // see GIFPackedRegHandlerSTQ
m_v.m[0] = st.upl64(rgba.upl32(q)); // TODO: only store the last one
GSVector4i xy = GSVector4i::loadl(&r[2].U64[0]);
GSVector4i zf = GSVector4i::loadl(&r[2].U64[1]);
xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::load((int)m_v.UV));
zf = zf.srl32<4>() & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());
m_v.m[1] = xy.upl32(zf); // TODO: only store the last one
VertexKick<prim, auto_flush>(r[2].XYZF2.Skip());
r += 3;
}
// r now points one past the stream, so r[-3] is the STQ register of the final triple.
// Note this latches the raw Q from the packet, without the zero/NaN substitution done by GIFPackedRegHandlerSTQ.
m_q = r[-3].STQ.Q; // remember the last one, STQ outputs this to the temp Q each time
}
// Batch handler for packed register streams laid out as repeating STQ, RGBA, XYZ2 triples.
// r: first register of the stream; size: number of registers, must be a non-zero multiple of 3
// (asserted). Each triple produces one vertex kick.
template <u32 prim, bool auto_flush>
void GSState::GIFPackedRegHandlerSTQRGBAXYZ2(const GIFPackedReg* RESTRICT r, u32 size)
{
pxAssert(size > 0 && size % 3 == 0);
// One flush check for the whole batch.
CheckFlushes();
const GIFPackedReg* RESTRICT r_end = r + size;
while (r < r_end)
{
// r[0] = STQ, r[1] = RGBA, r[2] = XYZ2.
const GSVector4i st = GSVector4i::loadl(&r[0].U64[0]);
GSVector4i q = GSVector4i::loadl(&r[0].U64[1]);
const GSVector4i rgba = (GSVector4i::load<false>(&r[1]) & GSVector4i::x000000ff()).ps32().pu16();
q = q.blend8(GSVector4i::cast(GSVector4(FLT_MIN)), q == GSVector4i::zero()); // see GIFPackedRegHandlerSTQ
m_v.m[0] = st.upl64(rgba.upl32(q)); // TODO: only store the last one
const GSVector4i xy = GSVector4i::loadl(&r[2].U64[0]);
const GSVector4i z = GSVector4i::loadl(&r[2].U64[1]);
const GSVector4i xyz = xy.upl16(xy.srl<4>()).upl32(z);
m_v.m[1] = xyz.upl64(GSVector4i::loadl(&m_v.UV)); // TODO: only store the last one
VertexKick<prim, auto_flush>(r[2].XYZ2.Skip());
r += 3;
}
// r now points one past the stream, so r[-3] is the STQ register of the final triple.
// Note this latches the raw Q from the packet, without the zero/NaN substitution done by GIFPackedRegHandlerSTQ.
m_q = r[-3].STQ.Q; // remember the last one, STQ outputs this to the temp Q each time
}
// Batch (sized) overload of the packed NOP handler: ignores both the registers and the count.
void GSState::GIFPackedRegHandlerNOP(const GIFPackedReg* RESTRICT r, u32 size)
{
}
// Default A+D register handler: intentionally does nothing with `r`.
// ResetHandlers() uses it for every unassigned address and for SIGNAL, FINISH and LABEL.
void GSState::GIFRegHandlerNull(const GIFReg* RESTRICT r)
{
}
// Applies a PRIM register write.
// With PRMODECONT.AC == 1 the whole register is taken from `prim` and the context is refreshed;
// otherwise only the 3-bit primitive type is updated. Afterwards the PRIM dirty bit is
// recomputed against the previous environment, the vertex-kick handler is refreshed and
// vertices not yet consumed by an index are dropped from the vertex buffer.
__forceinline void GSState::ApplyPRIM(u32 prim)
{
	const bool take_full_register = (m_env.PRMODECONT.AC == 1);
	if (!take_full_register)
	{
		m_env.PRIM.PRIM = prim & 0x7;
	}
	else
	{
		m_env.PRIM.U32[0] = prim;
		UpdateContext();
	}

	// PRIM is dirty exactly when it differs from the value used by the queued draw.
	const int prim_dirty_bit = (1 << DIRTY_REG_PRIM);
	if (m_prev_env.PRIM.U32[0] != m_env.PRIM.U32[0])
		m_dirty_gs_regs |= prim_dirty_bit;
	else
		m_dirty_gs_regs &= ~prim_dirty_bit;

	UpdateVertexKick();

	pxAssert(m_index.tail == 0 || m_index.buff[m_index.tail - 1] + 1 == m_vertex.next);

	if (m_index.tail == 0)
		m_vertex.next = 0;

	// remove unused vertices from the end of the vertex buffer
	m_vertex.tail = m_vertex.next;
	m_vertex.head = m_vertex.tail;
}
// A+D PRIM handler: forwards the low 32 bits of the register to ApplyPRIM.
// Also installed in the packed table via a cast (see ResetHandlers).
void GSState::GIFRegHandlerPRIM(const GIFReg* RESTRICT r)
{
ALIGN_STACK(32);
ApplyPRIM(r->PRIM.U32[0]);
}
// A+D RGBAQ handler: stores colour and a sanitised Q into m_v.RGBAQ.
// Unlike the packed STQ path, a zero Q is replaced here with GSVector4::m_one rather than FLT_MIN.
void GSState::GIFRegHandlerRGBAQ(const GIFReg* RESTRICT r)
{
const GSVector4i rgbaq = (GSVector4i)r->RGBAQ;
// Zero lanes are replaced with m_one, then lane 1 (taken via yyyy) is used as Q.
GSVector4i q = rgbaq.blend8(GSVector4i::cast(GSVector4::m_one), rgbaq == GSVector4i::zero()).yyyy(); // see GIFPackedRegHandlerSTQ
// Silent Hill output a nan in Q to emulate the flash light. Unfortunately it
// breaks GSVertexTrace code that rely on min/max.
q = GSVector4i::cast(GSVector4::cast(q).replace_nan(GSVector4::m_max));
// Recombine the original colour lane with the sanitised Q.
m_v.RGBAQ = rgbaq.upl32(q);
}
// A+D ST handler: copies S/T into the current vertex.
// Dev/debug builds additionally warn when either coordinate is NaN.
void GSState::GIFRegHandlerST(const GIFReg* RESTRICT r)
{
m_v.ST = r->ST;
#if defined(PCSX2_DEVBUILD) || defined(_DEBUG)
if (std::isnan(m_v.ST.S) || std::isnan(m_v.ST.T))
Console.Warning("S or T is nan");
#endif
}
// A+D UV handler: stores the register's low word in m_v.UV, keeping only the
// low 14 bits of each 16-bit half.
void GSState::GIFRegHandlerUV(const GIFReg* RESTRICT r)
{
	const auto raw_uv = r->UV.U32[0];
	m_v.UV = raw_uv & 0x3fff3fff;
}
// Variant of GIFRegHandlerUV installed when UserHacks_ForceEvenSpritePosition is on.
// Stores UV the same way and clears m_isPackedUV_HackFlag, recording that the UV came
// from the non-packed path (the packed variant sets the flag to true).
void GSState::GIFRegHandlerUV_Hack(const GIFReg* RESTRICT r)
{
m_v.UV = r->UV.U32[0] & 0x3fff3fff;
m_isPackedUV_HackFlag = false;
}
// A+D XYZF handler: builds m_v.m[1] from X/Y, the low 24 bits of the Z/F word, the current UV
// and the fog byte (top 8 bits of the Z/F word), then kicks the vertex.
// prim: primitive type, adc: non-zero makes the kick a "skip", auto_flush: forwarded to VertexKick.
template <u32 prim, u32 adc, bool auto_flush>
void GSState::GIFRegHandlerXYZF2(const GIFReg* RESTRICT r)
{
// Flush check is skipped only for a skip kick when neither the primitive class changed nor XYOFFSET is dirty.
if (!adc || GSUtil::GetPrimClass(m_prev_env.PRIM.PRIM) != GSUtil::GetPrimClass(m_env.PRIM.PRIM) || (m_dirty_gs_regs & (1 << DIRTY_REG_XYOFFSET)))
CheckFlushes();
const GSVector4i xyzf = GSVector4i::loadl(&r->XYZF);
// Keep X/Y intact and mask the second word down to the 24-bit Z.
const GSVector4i xyz = xyzf & (GSVector4i::xffffffff().upl32(GSVector4i::x00ffffff()));
const GSVector4i uvf = GSVector4i::load((int)m_v.UV).upl32(xyzf.srl32<24>().srl<4>());
m_v.m[1] = xyz.upl64(uvf);
VertexKick<prim, auto_flush>(adc);
}
// A+D XYZ handler: builds m_v.m[1] from the register's XYZ data and the current UV,
// then kicks the vertex.
// prim: primitive type, adc: non-zero makes the kick a "skip", auto_flush: forwarded to VertexKick.
template <u32 prim, u32 adc, bool auto_flush>
void GSState::GIFRegHandlerXYZ2(const GIFReg* RESTRICT r)
{
// Flush check is skipped only for a skip kick when neither the primitive class changed nor XYOFFSET is dirty.
if (!adc || GSUtil::GetPrimClass(m_prev_env.PRIM.PRIM) != GSUtil::GetPrimClass(m_env.PRIM.PRIM) || (m_dirty_gs_regs & (1 << DIRTY_REG_XYOFFSET)))
CheckFlushes();
m_v.m[1] = GSVector4i::load(&r->XYZ, &m_v.UV);
VertexKick<prim, auto_flush>(adc);
}
// Applies a TEX0 value to drawing context `i`.
// Steps, in order: queue a possible CLUT load, decide whether the CLUT will be rewritten
// (flushing pending draws or writes if so), store the register, invalidate the local memory
// backing the CLUT and perform the CLUT write, and finally update the TEX0 dirty bit if `i`
// is the context used by the queued draw.
// TEX0 is taken by non-const reference: its CPSM field is masked in place.
template <int i>
void GSState::ApplyTEX0(GIFRegTEX0& TEX0)
{
// TODO: Paletted Formats
// 8-bit and 4 bit formats need to be addressed with a buffer width divisible 2.
// However, not doing so is possible and does have a behavior on the GS.
// When implementing such code care must be taken not to apply it unless it is
// used for a draw. Galaxy Angel will send TEX0 with a PSM of T8 and a TBW of 7
// only to immediately update it to CT32 with TEX2. The old code used to apply a
// correction on the TEX0 setting which caused the game to draw the CT32 texture
// with an incorrect buffer width.
//
// Bouken Jidai Katsugeki Goemon apparently uses a TBW of 1 but this game is currently
// extremely broken for the same reasons as MLB Power Pros in that it spams TEX0 with
// complete garbage making for a nice 1G heap of GSOffset.
GL_REG("Apply TEX0_%d = 0x%x_%x", i, TEX0.U32[1], TEX0.U32[0]);
// (PSM & 0x7) >= 3 is the test this file uses for formats that need a CLUT.
if ((TEX0.PSM & 0x7) >= 3 && m_mem.m_clut.CanLoadCLUT(TEX0))
{
m_mem.m_clut.ClearDrawInvalidity();
m_mem.m_clut.SetNextCLUTTEX0(TEX0.U64);
CheckCLUTValidity(m_prev_env.PRIM.PRIM);
}
// Even if TEX0 did not change, a new palette may have been uploaded and will overwrite the currently queued for drawing.
const bool wt = m_mem.m_clut.WriteTest(TEX0, m_env.TEXCLUT);
// No need to flush on CLUT if we aren't texture mapping.
if (wt)
{
// Full flush only when the queued draw samples a paletted texture or the CLUT reports invalidity bit 2;
// otherwise only pending writes are flushed.
if ((m_prev_env.PRIM.TME && (m_prev_env.CTXT[m_prev_env.PRIM.CTXT].TEX0.PSM & 0x7) >= 3) || (m_mem.m_clut.IsInvalid() & 2))
Flush(GSFlushReason::CLUTCHANGE);
else
FlushWrite();
// Abort any channel shuffle skipping, since this is likely part of a new shuffle.
// Test case: Tomb Raider series. This is gated by the CBP actually changing, because
// Urban Chaos writes to the memory backing the CLUT in the middle of a shuffle, and
// it's unclear whether the CLUT would actually get reloaded in that case.
if (TEX0.CBP != m_mem.m_clut.GetCLUTCBP())
m_channel_shuffle_abort = true;
}
// Only bits 1 and 3 of CPSM are kept.
TEX0.CPSM &= 0xa; // 1010b
m_env.CTXT[i].TEX0 = TEX0;
if (wt)
{
// Describe the CLUT source area as a BITBLTBUF + rectangle so it can be invalidated before the CLUT is written.
GIFRegBITBLTBUF BITBLTBUF = {};
GSVector4i r;
if (TEX0.CSM == 0)
{
// CSM 0: the CLUT is covered block by block, starting at CBP.
BITBLTBUF.SBP = TEX0.CBP;
BITBLTBUF.SBW = 1;
BITBLTBUF.SPSM = TEX0.CPSM;
r.left = 0;
r.top = 0;
r.right = GSLocalMemory::m_psm[TEX0.CPSM].bs.x;
r.bottom = GSLocalMemory::m_psm[TEX0.CPSM].bs.y;
// Up to 4 blocks; halved for a 16-bit CLUT format and halved again for a 4-bit texture format.
int blocks = 4;
if (GSLocalMemory::m_psm[TEX0.CPSM].trbpp == 16)
blocks >>= 1;
if (GSLocalMemory::m_psm[TEX0.PSM].trbpp == 4)
blocks >>= 1;
// Invalidating videomem is slow, so *only* do it when it's definitely a CLUT draw in HW mode.
for (int j = 0; j < blocks; j++, BITBLTBUF.SBP++)
InvalidateLocalMem(BITBLTBUF, r, true);
}
else
{
// CSM != 0: the CLUT is a single row located by TEXCLUT (CBW, COU, COV), `pal` entries wide.
BITBLTBUF.SBP = TEX0.CBP;
BITBLTBUF.SBW = m_env.TEXCLUT.CBW;
BITBLTBUF.SPSM = TEX0.CPSM;
r.left = m_env.TEXCLUT.COU;
r.top = m_env.TEXCLUT.COV;
r.right = r.left + GSLocalMemory::m_psm[TEX0.CPSM].pal;
r.bottom = r.top + 1;
InvalidateLocalMem(BITBLTBUF, r, true);
}
m_mem.m_clut.Write(m_env.CTXT[i].TEX0, m_env.TEXCLUT);
}
// Only the fields covered by `mask` count as a change; the CLUT fields are included for paletted formats only.
u64 mask = 0x1fffffffffull; // TBP0 TBW PSM TW TH TCC TFX
if ((TEX0.PSM & 0x7) >= 3)
mask |= 0x1f78000000000000ull; // CPSM CSA
// Dirty tracking applies only to the context used by the queued draw.
if (i == m_prev_env.PRIM.CTXT)
{
if (m_prev_env.CTXT[i].TEX0.TBP0 != m_env.CTXT[i].TEX0.TBP0)
m_texflush_flag = false;
if ((m_prev_env.CTXT[i].TEX0.U64 ^ m_env.CTXT[i].TEX0.U64) & mask)
m_dirty_gs_regs |= (1 << DIRTY_REG_TEX0);
else
m_dirty_gs_regs &= ~(1 << DIRTY_REG_TEX0);
}
}
// A+D TEX0 handler for context `i`.
// Works on a local copy of the register: when TEX1.MTBA is set and the texture qualifies,
// MIPTBP1 (base pointers and widths of mip levels 1-3) is derived automatically from the
// texture width; the TEX0 value is then handed to ApplyTEX0.
template <int i>
void GSState::GIFRegHandlerTEX0(const GIFReg* RESTRICT r)
{
GL_REG("TEX0_%d = 0x%x_%x", i, r->U32[1], r->U32[0]);
GIFRegTEX0 TEX0 = r->TEX0;
// Max allowed MTBA size for 32bit swizzled textures (including 8H 4HL etc) is 512, 16bit and normal 8/4bit formats can be 1024
const u32 maxTex = (GSLocalMemory::m_psm[TEX0.PSM].bpp < 32) ? 10 : 9;
// Spec max is 10, but bitfield allows for up to 15
// However STQ calculations expect the written size to be used for denormalization (Simple 2000 Series Vol 105 The Maid)
// This is clamped to 10 in the FixedTEX0 functions so texture sizes don't exceed 1024x1024, but STQ can calculate properly (with invalid_tex0)
//
// Yakuza (minimap)
// Sets TW/TH to 0
// Drawn using solid colors, the texture is really a 1x1 white texel,
// modulated by the vertex color. Cannot change the dimension because S/T are normalized.
//
// Tokyo Xtreme Racer Drift 2 (text)
// Sets TW/TH to 0
// there used to be a case to force this to 10
// but GetSizeFixedTEX0 sorts this now
TEX0.TW = std::clamp<u32>(TEX0.TW, 0, 15);
TEX0.TH = std::clamp<u32>(TEX0.TH, 0, 15);
// MTBA loads are triggered by writes to TEX0 (but not TEX2!)
// Textures MUST be a minimum width of 32 pixels
// Format must be a color, Z formats do not trigger MTBA (but are valid for Mipmapping)
if (m_env.CTXT[i].TEX1.MTBA && TEX0.TW >= 5 && TEX0.TW <= maxTex && (TEX0.PSM & 0x30) != 0x30)
{
GIFRegMIPTBP1& mip_tbp1 = m_env.CTXT[i].MIPTBP1;
// NOTE 1: TEX1.MXL must not be automatically set to 3 here and it has no effect on MTBA.
// NOTE 2: Mipmap levels are packed with a minimum distance between them of 1 block, even down at 4bit textures under 16x16.
// NOTE 3: Everything is derived from the width of the texture, TBW and TH are completely ignored (useful for handling non-rectangular ones)
// NOTE 4: Cartoon Network Racing's menu is VERY sensitive to this as it uses 4bit sized textures for the sky.
u32 bp = TEX0.TBP0;
// Buffer width in units of 64 pixels, never below 1.
u32 bw = std::max(1u, (1u << TEX0.TW) >> 6);
// Address is calculated as a 4bit address space, then converted (/8) to 32bit address space
// ((w * w * bpp) / 8) / 64. No the 'w' is not a typo ;)
const u32 bpp = GSLocalMemory::m_psm[TEX0.PSM].bpp >> 2;
u32 tex_size = ((1u << TEX0.TW) * (1u << TEX0.TW) * bpp) >> 9;
// Level 1 starts right after the base texture; each further level has half the buffer
// width and a quarter of the size, both clamped to at least 1.
bp += tex_size;
bw = std::max<u32>(bw >> 1, 1);
tex_size = std::max<u32>(tex_size >> 2, 1);
mip_tbp1.TBP1 = bp;
mip_tbp1.TBW1 = bw;
bp += tex_size;
bw = std::max<u32>(bw >> 1, 1);
tex_size = std::max<u32>(tex_size >> 2, 1);
mip_tbp1.TBP2 = bp;
mip_tbp1.TBW2 = bw;
bp += tex_size;
bw = std::max<u32>(bw >> 1, 1);
mip_tbp1.TBP3 = bp;
mip_tbp1.TBW3 = bw;
// Dirty tracking applies only to the context used by the queued draw.
if (i == m_prev_env.PRIM.CTXT)
{
if (m_prev_env.CTXT[i].MIPTBP1.U64 ^ mip_tbp1.U64)
m_dirty_gs_regs |= (1 << DIRTY_REG_MIPTBP1);
else
m_dirty_gs_regs &= ~(1 << DIRTY_REG_MIPTBP1);
}
}
ApplyTEX0<i>(TEX0);
}
// GIF handler for CLAMP_1 / CLAMP_2: stores the register for context i and, if that is
// the context recorded in m_prev_env.PRIM, updates the CLAMP dirty bit to reflect whether
// the new value differs from the m_prev_env copy.
template <int i>
void GSState::GIFRegHandlerCLAMP(const GIFReg* RESTRICT r)
{
	GL_REG("CLAMP_%d = 0x%x_%x", i, r->U32[1], r->U32[0]);

	auto& clamp = m_env.CTXT[i].CLAMP;
	clamp = r->CLAMP;

	if (i != m_prev_env.PRIM.CTXT)
		return;

	const auto dirty_bit = (1 << DIRTY_REG_CLAMP);
	if (m_prev_env.CTXT[i].CLAMP.U64 == clamp.U64)
		m_dirty_gs_regs &= ~dirty_bit;
	else
		m_dirty_gs_regs |= dirty_bit;
}
void GSState::GIFRegHandlerFOG(const GIFReg* RESTRICT r)
{
m_v.FOG = r->FOG.F;
}
// GIF handler for NOP: intentionally does nothing with the register value.
void GSState::GIFRegHandlerNOP(const GIFReg* RESTRICT r)
{
}
// GIF handler for TEX1_1 / TEX1_2: stores the register for context i and, if that is
// the context recorded in m_prev_env.PRIM, updates the TEX1 dirty bit.
template <int i>
void GSState::GIFRegHandlerTEX1(const GIFReg* RESTRICT r)
{
	GL_REG("TEX1_%d = 0x%x_%x", i, r->U32[1], r->U32[0]);

	auto& tex1 = m_env.CTXT[i].TEX1;
	tex1 = r->TEX1;

	if (i != m_prev_env.PRIM.CTXT)
		return;

	const auto dirty_bit = (1 << DIRTY_REG_TEX1);
	if (m_prev_env.CTXT[i].TEX1.U64 == tex1.U64)
		m_dirty_gs_regs &= ~dirty_bit;
	else
		m_dirty_gs_regs |= dirty_bit;
}
template <int i>
void GSState::GIFRegHandlerTEX2(const GIFReg* RESTRICT r)
{
GL_REG("TEX2_%d = 0x%x_%x", i, r->U32[1], r->U32[0]);
// TEX2 is a masked write to TEX0, for performing CLUT swaps (palette swaps).
// It only applies the following fields:
// CLD, CSA, CSM, CPSM, CBP, PSM.
// It ignores these fields (uses existing values in the context):
// TFX, TCC, TH, TW, TBW, and TBP0
constexpr u64 mask = 0xFFFFFFE003F00000ull; // TEX2 bits
GIFRegTEX0 TEX0{};
TEX0.U64 = (m_env.CTXT[i].TEX0.U64 & ~mask) | (r->U64 & mask);
ApplyTEX0<i>(TEX0);
}
template <int i>
void GSState::GIFRegHandlerXYOFFSET(const GIFReg* RESTRICT r)
{
GL_REG("XYOFFSET_%d = 0x%x_%x", i, r->U32[1], r->U32[0]);
const u64 r_masked = r->U64 & 0x0000FFFF0000FFFFu;
if (i == m_prev_env.PRIM.CTXT)
{
if (m_prev_env.CTXT[i].XYOFFSET.U64 != r_masked)
m_dirty_gs_regs |= (1 << DIRTY_REG_XYOFFSET);
else
m_dirty_gs_regs &= ~(1 << DIRTY_REG_XYOFFSET);
}
if (m_env.CTXT[i].XYOFFSET.U64 == r_masked)
return;
m_env.CTXT[i].XYOFFSET.U64 = r_masked;
m_env.CTXT[i].UpdateScissor();
UpdateScissor();
}
void GSState::GIFRegHandlerPRMODECONT(const GIFReg* RESTRICT r)
{
GL_REG("PRMODECONT = 0x%x_%x", r->U32[1], r->U32[0]);
m_env.PRMODECONT.AC = r->PRMODECONT.AC;
}
void GSState::GIFRegHandlerPRMODE(const GIFReg* RESTRICT r)
{
GL_REG("PRMODE = 0x%x_%x", r->U32[1], r->U32[0]);
// We're in PRIM mode, need to ignore any writes
if (m_env.PRMODECONT.AC)
return;
const u32 _PRIM = m_env.PRIM.PRIM;
m_env.PRIM = r->PRMODE;
m_env.PRIM.PRIM = _PRIM;
if (m_prev_env.PRIM.U32[0] ^ m_env.PRIM.U32[0])
m_dirty_gs_regs |= (1 << DIRTY_REG_PRIM);
else
m_dirty_gs_regs &= ~(1 << DIRTY_REG_PRIM);
UpdateContext();
}
void GSState::GIFRegHandlerTEXCLUT(const GIFReg* RESTRICT r)
{
GL_REG("TEXCLUT = 0x%x_%x", r->U32[1], r->U32[0]);
m_env.TEXCLUT = r->TEXCLUT;
}
// GIF handler for SCANMSK: stores the register, sets m_scanmask_used to 2 whenever bit 1
// of MSK is set, and updates the SCANMSK dirty bit against m_prev_env.
void GSState::GIFRegHandlerSCANMSK(const GIFReg* RESTRICT r)
{
	m_env.SCANMSK = r->SCANMSK;

	const bool masks_scanlines = (m_env.SCANMSK.MSK & 2) != 0;
	if (masks_scanlines)
		m_scanmask_used = 2;

	const auto dirty_bit = (1 << DIRTY_REG_SCANMSK);
	if (m_prev_env.SCANMSK.MSK == m_env.SCANMSK.MSK)
		m_dirty_gs_regs &= ~dirty_bit;
	else
		m_dirty_gs_regs |= dirty_bit;
}
// GIF handler for MIPTBP1_1 / MIPTBP1_2: stores the register for context i and, if that is
// the context recorded in m_prev_env.PRIM, updates the MIPTBP1 dirty bit.
template <int i>
void GSState::GIFRegHandlerMIPTBP1(const GIFReg* RESTRICT r)
{
	GL_REG("MIPTBP1_%d = 0x%x_%x", i, r->U32[1], r->U32[0]);

	auto& miptbp1 = m_env.CTXT[i].MIPTBP1;
	miptbp1 = r->MIPTBP1;

	if (i != m_prev_env.PRIM.CTXT)
		return;

	const auto dirty_bit = (1 << DIRTY_REG_MIPTBP1);
	if (m_prev_env.CTXT[i].MIPTBP1.U64 == miptbp1.U64)
		m_dirty_gs_regs &= ~dirty_bit;
	else
		m_dirty_gs_regs |= dirty_bit;
}
// GIF handler for MIPTBP2_1 / MIPTBP2_2: stores the register for context i and, if that is
// the context recorded in m_prev_env.PRIM, updates the MIPTBP2 dirty bit.
template <int i>
void GSState::GIFRegHandlerMIPTBP2(const GIFReg* RESTRICT r)
{
	GL_REG("MIPTBP2_%d = 0x%x_%x", i, r->U32[1], r->U32[0]);

	auto& miptbp2 = m_env.CTXT[i].MIPTBP2;
	miptbp2 = r->MIPTBP2;

	if (i != m_prev_env.PRIM.CTXT)
		return;

	const auto dirty_bit = (1 << DIRTY_REG_MIPTBP2);
	if (m_prev_env.CTXT[i].MIPTBP2.U64 == miptbp2.U64)
		m_dirty_gs_regs &= ~dirty_bit;
	else
		m_dirty_gs_regs |= dirty_bit;
}
// GIF handler for TEXA: stores the register and updates the TEXA dirty bit against m_prev_env.
void GSState::GIFRegHandlerTEXA(const GIFReg* RESTRICT r)
{
	GL_REG("TEXA = 0x%x_%x", r->U32[1], r->U32[0]);

	m_env.TEXA = r->TEXA;

	const auto dirty_bit = (1 << DIRTY_REG_TEXA);
	const bool changed = (m_prev_env.TEXA != m_env.TEXA);
	if (changed)
		m_dirty_gs_regs |= dirty_bit;
	else
		m_dirty_gs_regs &= ~dirty_bit;
}
// GIF handler for FOGCOL: stores the register and updates the FOGCOL dirty bit against m_prev_env.
void GSState::GIFRegHandlerFOGCOL(const GIFReg* RESTRICT r)
{
	GL_REG("FOGCOL = 0x%x_%x", r->U32[1], r->U32[0]);

	m_env.FOGCOL = r->FOGCOL;

	const auto dirty_bit = (1 << DIRTY_REG_FOGCOL);
	const bool changed = (m_prev_env.FOGCOL != m_env.FOGCOL);
	if (changed)
		m_dirty_gs_regs |= dirty_bit;
	else
		m_dirty_gs_regs &= ~dirty_bit;
}
// GIF handler for TEXFLUSH: raises m_texflush_flag unless TEX0 is already marked dirty.
void GSState::GIFRegHandlerTEXFLUSH(const GIFReg* RESTRICT r)
{
	GL_REG("TEXFLUSH = 0x%x_%x PRIM TME %x", r->U32[1], r->U32[0], PRIM->TME);

	// No need to do a flush if TEX0 has changed.
	const bool tex0_dirty = (m_dirty_gs_regs & (1 << DIRTY_REG_TEX0)) != 0;
	if (!tex0_dirty)
		m_texflush_flag = true;
}
template <int i>
void GSState::GIFRegHandlerSCISSOR(const GIFReg* RESTRICT r)
{
if (i == m_prev_env.PRIM.CTXT)
{
if (m_prev_env.CTXT[i].SCISSOR.U64 != r->SCISSOR.U64)
m_dirty_gs_regs |= (1 << DIRTY_REG_SCISSOR);
else
m_dirty_gs_regs &= ~(1 << DIRTY_REG_SCISSOR);
}
if (m_env.CTXT[i].SCISSOR.U64 == r->SCISSOR.U64)
return;
m_env.CTXT[i].SCISSOR = r->SCISSOR;
m_env.CTXT[i].UpdateScissor();
UpdateScissor();
}
template <int i>
void GSState::GIFRegHandlerALPHA(const GIFReg* RESTRICT r)
{
GL_REG("ALPHA = 0x%x_%x", r->U32[1], r->U32[0]);
m_env.CTXT[i].ALPHA = r->ALPHA;
// value of 3 is not allowed by the spec
// acts like 2 on real hw, so just clamp it
m_env.CTXT[i].ALPHA.A = std::clamp<u32>(r->ALPHA.A, 0, 2);
m_env.CTXT[i].ALPHA.B = std::clamp<u32>(r->ALPHA.B, 0, 2);
m_env.CTXT[i].ALPHA.C = std::clamp<u32>(r->ALPHA.C, 0, 2);
m_env.CTXT[i].ALPHA.D = std::clamp<u32>(r->ALPHA.D, 0, 2);
if (i == m_prev_env.PRIM.CTXT)
{
if (m_prev_env.CTXT[i].ALPHA.U64 != m_env.CTXT[i].ALPHA.U64)
m_dirty_gs_regs |= (1 << DIRTY_REG_ALPHA);
else
m_dirty_gs_regs &= ~(1 << DIRTY_REG_ALPHA);
}
}
// GIF handler for DIMX: stores the register and updates the DIMX dirty bit against m_prev_env.
void GSState::GIFRegHandlerDIMX(const GIFReg* RESTRICT r)
{
	m_env.DIMX = r->DIMX;

	const auto dirty_bit = (1 << DIRTY_REG_DIMX);
	const bool changed = (m_prev_env.DIMX != m_env.DIMX);
	if (changed)
		m_dirty_gs_regs |= dirty_bit;
	else
		m_dirty_gs_regs &= ~dirty_bit;
}
// GIF handler for DTHE: stores the register and updates the DTHE dirty bit against m_prev_env.
void GSState::GIFRegHandlerDTHE(const GIFReg* RESTRICT r)
{
	m_env.DTHE = r->DTHE;

	const auto dirty_bit = (1 << DIRTY_REG_DTHE);
	const bool changed = (m_prev_env.DTHE != m_env.DTHE);
	if (changed)
		m_dirty_gs_regs |= dirty_bit;
	else
		m_dirty_gs_regs &= ~dirty_bit;
}
// GIF handler for COLCLAMP: stores the register and updates the COLCLAMP dirty bit against m_prev_env.
void GSState::GIFRegHandlerCOLCLAMP(const GIFReg* RESTRICT r)
{
	m_env.COLCLAMP = r->COLCLAMP;

	const auto dirty_bit = (1 << DIRTY_REG_COLCLAMP);
	const bool changed = (m_prev_env.COLCLAMP != m_env.COLCLAMP);
	if (changed)
		m_dirty_gs_regs |= dirty_bit;
	else
		m_dirty_gs_regs &= ~dirty_bit;
}
// GIF handler for TEST_1 / TEST_2: stores the register for context i and, if that is
// the context recorded in m_prev_env.PRIM, updates the TEST dirty bit.
template <int i>
void GSState::GIFRegHandlerTEST(const GIFReg* RESTRICT r)
{
	m_env.CTXT[i].TEST = r->TEST;

	if (i != m_prev_env.PRIM.CTXT)
		return;

	const auto dirty_bit = (1 << DIRTY_REG_TEST);
	const bool changed = (m_prev_env.CTXT[i].TEST != m_env.CTXT[i].TEST);
	if (changed)
		m_dirty_gs_regs |= dirty_bit;
	else
		m_dirty_gs_regs &= ~dirty_bit;
}
// GIF handler for PABE: stores the register and updates the PABE dirty bit against m_prev_env.
void GSState::GIFRegHandlerPABE(const GIFReg* RESTRICT r)
{
	m_env.PABE = r->PABE;

	const auto dirty_bit = (1 << DIRTY_REG_PABE);
	const bool changed = (m_prev_env.PABE != m_env.PABE);
	if (changed)
		m_dirty_gs_regs |= dirty_bit;
	else
		m_dirty_gs_regs &= ~dirty_bit;
}
// GIF handler for FBA_1 / FBA_2: stores the register for context i and, if that is
// the context recorded in m_prev_env.PRIM, updates the FBA dirty bit.
template <int i>
void GSState::GIFRegHandlerFBA(const GIFReg* RESTRICT r)
{
	m_env.CTXT[i].FBA = r->FBA;

	if (i != m_prev_env.PRIM.CTXT)
		return;

	const auto dirty_bit = (1 << DIRTY_REG_FBA);
	const bool changed = (m_prev_env.CTXT[i].FBA != m_env.CTXT[i].FBA);
	if (changed)
		m_dirty_gs_regs |= dirty_bit;
	else
		m_dirty_gs_regs &= ~dirty_bit;
}
template <int i>
void GSState::GIFRegHandlerFRAME(const GIFReg* RESTRICT r)
{
GL_REG("FRAME_%d = 0x%x_%x", i, r->U32[1], r->U32[0]);
GIFRegFRAME NewFrame = r->FRAME;
// FBW is clamped to 32
NewFrame.FBW = std::min(NewFrame.FBW, 32U);
if ((NewFrame.PSM & 0x30) == 0x30)
m_env.CTXT[i].ZBUF.PSM &= ~0x30;
else
m_env.CTXT[i].ZBUF.PSM |= 0x30;
if ((m_env.CTXT[i].FRAME.U32[0] ^ NewFrame.U32[0]) & 0x3f3f01ff) // FBP FBW PSM
{
m_env.CTXT[i].offset.fb = m_mem.GetOffset(NewFrame.Block(), NewFrame.FBW, NewFrame.PSM);
m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), NewFrame.FBW, m_env.CTXT[i].ZBUF.PSM);
m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(NewFrame, m_env.CTXT[i].ZBUF);
}
m_env.CTXT[i].FRAME = NewFrame;
switch (m_env.CTXT[i].FRAME.PSM)
{
case PSMT8H:
// Berserk uses the format to only update the alpha channel
GL_INS("CORRECT FRAME FORMAT replaces PSMT8H by PSMCT32/0x00FF_FFFF");
m_env.CTXT[i].FRAME.PSM = PSMCT32;
m_env.CTXT[i].FRAME.FBMSK = 0x00FFFFFF;
break;
case PSMT4HH: // Not tested. Based on PSMT8H behavior
GL_INS("CORRECT FRAME FORMAT replaces PSMT4HH by PSMCT32/0x0FFF_FFFF");
m_env.CTXT[i].FRAME.PSM = PSMCT32;
m_env.CTXT[i].FRAME.FBMSK = 0x0FFFFFFF;
break;
case PSMT4HL: // Not tested. Based on PSMT8H behavior
GL_INS("CORRECT FRAME FORMAT replaces PSMT4HL by PSMCT32/0xF0FF_FFFF");
m_env.CTXT[i].FRAME.PSM = PSMCT32;
m_env.CTXT[i].FRAME.FBMSK = 0xF0FFFFFF;
break;
default:
break;
}
if (i == m_prev_env.PRIM.CTXT)
{
if (m_prev_env.CTXT[i].FRAME != m_env.CTXT[i].FRAME)
m_dirty_gs_regs |= (1 << DIRTY_REG_FRAME);
else
m_dirty_gs_regs &= ~(1 << DIRTY_REG_FRAME);
}
}
template <int i>
void GSState::GIFRegHandlerZBUF(const GIFReg* RESTRICT r)
{
GL_REG("ZBUF_%d = 0x%x_%x", i, r->U32[1], r->U32[0]);
GIFRegZBUF ZBUF = r->ZBUF;
// We tested this on the PS2 and it seems to be that when the FRAME is a Z format,
// the Z buffer is forced to use color swizzling.
// Powerdrome relies on this behavior to clear the z buffer by drawing 32 pixel wide strips, skipping 32,
// causing the FRAME to do one strip and the Z to do the other 32 due to the block arrangement.
// Other games listed here also hit this Color/Z swap behaviour without masking Z so could be problematic:
// Black, Driver Parallel Lines, Driv3r, Dropship, DT Racer, Scarface, The Simpsons, THP8
if ((m_env.CTXT[i].FRAME.PSM & 0x30) == 0x30)
ZBUF.PSM &= ~0x30;
else
ZBUF.PSM |= 0x30;
if ((m_env.CTXT[i].ZBUF.U32[0] ^ ZBUF.U32[0]) & 0x3f0001ff) // ZBP PSM
{
m_env.CTXT[i].offset.zb = m_mem.GetOffset(ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, ZBUF.PSM);
m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, ZBUF);
}
m_env.CTXT[i].ZBUF = ZBUF;
if (i == m_prev_env.PRIM.CTXT)
{
if (m_prev_env.CTXT[i].ZBUF != m_env.CTXT[i].ZBUF)
m_dirty_gs_regs |= (1 << DIRTY_REG_ZBUF);
else
m_dirty_gs_regs &= ~(1 << DIRTY_REG_ZBUF);
}
}
// GIF handler for BITBLTBUF: if the value differs from the stored one, any buffered
// write is flushed (FlushWrite) before the new value is stored.
void GSState::GIFRegHandlerBITBLTBUF(const GIFReg* RESTRICT r)
{
	// TODO: Paletted formats
	// There is a memory bug on the GS as it relates to the transferring of
	// 8-bit and 4-bit formats needing an even buffer width due to the
	// second half of the page being addressed by TBW/2
	//
	// namcoXcapcom: Apparently uses DBW of 5 and 11 (and refers to them
	// in TEX0 later as 4 and 10 respectively). However I can find no
	// documentation on this problem, nothing in the game to suggest
	// it is broken and the code here for it was likely incorrect to begin with.
	GL_REG("BITBLTBUF = 0x%x_%x", r->U32[1], r->U32[0]);

	const auto& incoming = r->BITBLTBUF;
	if (incoming != m_env.BITBLTBUF)
		FlushWrite();

	m_env.BITBLTBUF = incoming;
}
// GIF handler for TRXPOS: if the value differs from the stored one, any buffered
// write is flushed (FlushWrite) before the new value is stored.
void GSState::GIFRegHandlerTRXPOS(const GIFReg* RESTRICT r)
{
	GL_REG("TRXPOS = 0x%x_%x", r->U32[1], r->U32[0]);

	const auto& incoming = r->TRXPOS;
	if (incoming != m_env.TRXPOS)
		FlushWrite();

	m_env.TRXPOS = incoming;
}
// GIF handler for TRXREG: if the value differs from the stored one, any buffered
// write is flushed (FlushWrite) before the new value is stored.
void GSState::GIFRegHandlerTRXREG(const GIFReg* RESTRICT r)
{
	GL_REG("TRXREG = 0x%x_%x", r->U32[1], r->U32[0]);

	const auto& incoming = r->TRXREG;
	if (incoming != m_env.TRXREG)
		FlushWrite();

	m_env.TRXREG = incoming;
}
void GSState::GIFRegHandlerTRXDIR(const GIFReg* RESTRICT r)
{
GL_REG("TRXDIR = 0x%x_%x", r->U32[1], r->U32[0]);
FlushWrite();
m_env.TRXDIR = r->TRXDIR;
switch (m_env.TRXDIR.XDIR)
{
case 0: // host -> local
m_tr.Init(m_env.TRXPOS, m_env.TRXREG, m_env.BITBLTBUF, true);
break;
case 1: // local -> host
m_tr.Init(m_env.TRXPOS, m_env.TRXREG, m_env.BITBLTBUF, false);
break;
case 2: // local -> local
CheckWriteOverlap(true, true);
Move();
break;
default: // 3 deactivated as stated by manual. Tested on hardware and no transfers happen.
break;
}
}
// GIF handler for HWREG: feeds the 8 bytes of the register to Write() when a
// host -> local transfer (XDIR == 0) is selected; otherwise the write is dropped.
void GSState::GIFRegHandlerHWREG(const GIFReg* RESTRICT r)
{
	GL_REG("HWREG = 0x%x_%x", r->U32[1], r->U32[0]);

	// Don't bother if not host -> local, real hw ignores it.
	const bool host_to_local = (m_env.TRXDIR.XDIR == 0);
	if (host_to_local)
		Write(reinterpret_cast<const u8*>(r), 8); // haunting ground
}
// Flushes any buffered host -> local write and then, if primitives are queued, draws them.
// When registers were changed since the primitives were queued (m_dirty_gs_regs != 0) the
// draw is performed with m_prev_env and the live environment is restored afterwards.
// For VSYNC flushes the draw is skipped (left queued) unless the queued draw's frame
// buffer range intersects an enabled PCRTC display.
void GSState::Flush(GSFlushReason reason)
{
	FlushWrite();

	if (m_index.tail > 0)
	{
		// Unless Vsync really needs the pending draw, don't do it when VSync happens as it can really screw up our heuristics when looking ahead.
		if (reason == VSYNC)
		{
			const auto& frame = m_prev_env.CTXT[m_prev_env.PRIM.CTXT].FRAME;
			const u32 draw_start_bp = GSLocalMemory::GetStartBlockAddress(frame.Block(), frame.FBW, frame.PSM, temp_draw_rect);
			const u32 draw_end_bp = GSLocalMemory::GetEndBlockAddress(frame.Block(), frame.FBW, frame.PSM, temp_draw_rect);

			// True when the display is enabled and its block range intersects the queued draw's.
			const auto display_needs_flush = [&](const auto& display) {
				if (!display.enabled)
					return false;

				const u32 out_start_bp = GSLocalMemory::GetStartBlockAddress(display.Block(), display.FBW, display.PSM, display.framebufferRect);
				const u32 out_end_bp = GSLocalMemory::GetEndBlockAddress(display.Block(), display.FBW, display.PSM, display.framebufferRect);
				const bool disjoint = (out_start_bp > draw_end_bp || out_end_bp < draw_start_bp);
				return !disjoint;
			};

			if (!display_needs_flush(PCRTCDisplays.PCRTCDisplays[0]) && !display_needs_flush(PCRTCDisplays.PCRTCDisplays[1]))
				return;
		}

		m_state_flush_reason = reason;

		// Used to prompt the current draw that it's modifying its own CLUT.
		CheckCLUTValidity(m_prev_env.PRIM.PRIM);

		if (!m_dirty_gs_regs)
		{
			FlushPrim();
		}
		else
		{
			// Draw with the state the primitives were queued under...
			m_draw_env = &m_prev_env;
			PRIM = &m_prev_env.PRIM;
			UpdateContext();

			FlushPrim();

			// ...then switch back to the live register state.
			m_draw_env = &m_env;
			PRIM = &m_env.PRIM;
			UpdateContext();

			m_backed_up_ctx = -1;
		}

		m_dirty_gs_regs = 0;
		temp_draw_rect = GSVector4i::zero();
	}

	m_state_flush_reason = GSFlushReason::UNKNOWN;
}
void GSState::FlushWrite()
{
if (!m_tr.write)
return;
const int len = m_tr.end - m_tr.start;
if (len <= 0)
return;
GSVector4i r;
r = m_tr.rect;
InvalidateVideoMem(m_env.BITBLTBUF, r);
const GSLocalMemory::writeImage wi = GSLocalMemory::m_psm[m_env.BITBLTBUF.DPSM].wi;
wi(m_mem, m_tr.x, m_tr.y, &m_tr.buff[m_tr.start], len, m_tr.m_blit, m_tr.m_pos, m_tr.m_reg);
m_tr.start += len;
g_perfmon.Put(GSPerfMon::Swizzle, len);
s_transfer_n++;
if (m_tr.start >= m_tr.total)
m_env.TRXDIR.XDIR = 3;
}
// This function decides if the context has changed in a way which warrants flushing the draw.
inline bool GSState::TestDrawChanged()
{
// Check if PRIM has changed we need to check if it's just a different triangle or the context is changing.
if (m_dirty_gs_regs & (1 << DIRTY_REG_PRIM))
{
u32 prim_mask = 0x7ff;
if (GSUtil::GetPrimClass(m_prev_env.PRIM.PRIM) == GSUtil::GetPrimClass(m_env.PRIM.PRIM))
prim_mask &= ~0x7;
else
return true;
if ((m_env.PRIM.U32[0] ^ m_prev_env.PRIM.U32[0]) & prim_mask)
return true;
m_dirty_gs_regs &= ~(1 << DIRTY_REG_PRIM);
// Shortcut, a bunch of games just change the prim reg
if (!m_dirty_gs_regs)
return false;
}
if ((m_dirty_gs_regs & ((1 << DIRTY_REG_TEST) | (1 << DIRTY_REG_SCISSOR) | (1 << DIRTY_REG_XYOFFSET) | (1 << DIRTY_REG_SCANMSK) | (1 << DIRTY_REG_DTHE))) || ((m_dirty_gs_regs & (1 << DIRTY_REG_DIMX)) && m_prev_env.DTHE.DTHE))
return true;
if (m_prev_env.PRIM.ABE && (m_dirty_gs_regs & ((1 << DIRTY_REG_ALPHA) | (1 << DIRTY_REG_PABE))))
return true;
if (m_prev_env.PRIM.FGE && (m_dirty_gs_regs & (1 << DIRTY_REG_FOGCOL)))
return true;
const int context = m_prev_env.PRIM.CTXT;
const GSDrawingContext& ctx = m_prev_env.CTXT[context];
// If the frame is getting updated check the FRAME, otherwise, we can ignore it
if ((ctx.TEST.ATST != ATST_NEVER) || !ctx.TEST.ATE || (ctx.TEST.AFAIL & 1) || ctx.TEST.DATE)
{
if ((m_dirty_gs_regs & ((1 << DIRTY_REG_FRAME) | (1 << DIRTY_REG_COLCLAMP) | (1 << DIRTY_REG_FBA))))
return true;
}
if ((ctx.TEST.ATST != ATST_NEVER) || !ctx.TEST.ATE || ctx.TEST.AFAIL == AFAIL_ZB_ONLY)
{
if (m_dirty_gs_regs & (1 << DIRTY_REG_ZBUF))
return true;
}
if (m_prev_env.PRIM.TME)
{
if (m_dirty_gs_regs & ((1 << DIRTY_REG_TEX0) | (1 << DIRTY_REG_TEX1) | (1 << DIRTY_REG_CLAMP) | (1 << DIRTY_REG_TEXA)))
return true;
if(ctx.TEX1.MXL > 0 && (m_dirty_gs_regs & ((1 << DIRTY_REG_MIPTBP1) | (1 << DIRTY_REG_MIPTBP2))))
return true;
}
m_dirty_gs_regs = 0;
return false;
}
// Builds a mask with the low N bits set, where N = 9 + (max_exp - exp), capped at 23.
// FlushPrim uses it to clear the low bits of the S/T float bit patterns.
u32 GSState::CalcMask(int exp, int max_exp)
{
	const int exponent_gap = max_exp - exp;
	const int bit_count = std::min(9 + exponent_gap, 23);
	return (1 << bit_count) - 1;
}
// Draws the queued primitives (if any) and resets the vertex/index queues.
// Steps, in order:
//  1. bookkeeping (texflush flag, display-framebuffer blit counter, draw counter s_n),
//  2. saves the vertices between head and tail that must survive into the next batch,
//  3. updates the vertex trace, rounds S/T/Q for textured STQ draws,
//  4. calls Draw() unless the Z test is set to fail every pixel,
//  5. resets the queues and re-inserts the saved vertices.
void GSState::FlushPrim()
{
	if (m_index.tail > 0)
	{
		GL_REG("FlushPrim ctxt %d", PRIM->CTXT);

		// clear texture cache flushed flag, since we're reading from it
		m_texflush_flag = PRIM->TME ? false : m_texflush_flag;

		// internal frame rate detection based on sprite blits to the display framebuffer
		{
			const u32 FRAME_FBP = m_context->FRAME.FBP;
			if ((m_regs->DISP[0].DISPFB.FBP == FRAME_FBP && m_regs->PMODE.EN1) ||
				(m_regs->DISP[1].DISPFB.FBP == FRAME_FBP && m_regs->PMODE.EN2))
			{
				g_perfmon.AddDisplayFramebufferSpriteBlit();
			}
		}

		// Scratch storage for the (at most two) vertices carried over to the next batch.
		GSVertex buff[2];
		s_n++;

		// Snapshot the queue positions; they are reset below before the carry-over is restored.
		const u32 head = m_vertex.head;
		const u32 tail = m_vertex.tail;
		const u32 next = m_vertex.next;
		// Number of vertices saved in buff.
		u32 unused = 0;

		if (tail > head)
		{
			switch (PRIM->PRIM)
			{
				case GS_POINTLIST:
					pxAssert(0);
					break;
				case GS_LINELIST:
				case GS_LINESTRIP:
				case GS_SPRITE:
					// Keep the most recent vertex.
					unused = 1;
					buff[0] = m_vertex.buff[tail - 1];
					break;
				case GS_TRIANGLELIST:
				case GS_TRIANGLESTRIP:
					// Keep up to the two most recent vertices.
					// NOTE(review): the copy is always two vertices wide even when unused == 1;
					// only the first `unused` entries are restored later.
					unused = std::min<u32>(tail - head, 2);
					memcpy(buff, &m_vertex.buff[tail - unused], sizeof(GSVertex) * 2);
					break;
				case GS_TRIANGLEFAN:
					// Keep the vertex at head and, if there is another one, the most recent vertex.
					buff[0] = m_vertex.buff[head];
					unused = 1;
					if (tail - 1 > head)
					{
						buff[1] = m_vertex.buff[tail - 1];
						unused = 2;
					}
					break;
				case GS_INVALID:
					break;
				default:
					ASSUME(0);
			}

			pxAssert((int)unused < GSUtil::GetVertexCount(PRIM->PRIM));
		}

		// If the PSM format of Z is invalid, but it is masked (no write) and ZTST is set to ALWAYS pass (no test, just allow)
		// we can ignore the Z format, since it won't be used in the draw (Star Ocean 3 transitions)
#ifdef PCSX2_DEVBUILD
		const bool ignoreZ = m_context->ZBUF.ZMSK && m_context->TEST.ZTST == 1;
		if (GSLocalMemory::m_psm[m_context->FRAME.PSM].fmt >= 3 || (GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt >= 3 && !ignoreZ))
		{
			Console.Warning("GS: Possible invalid draw, Frame PSM %x ZPSM %x", m_context->FRAME.PSM, m_context->ZBUF.PSM);
		}
#endif

		// Update scissor, it may have been modified by a previous draw
		m_env.CTXT[PRIM->CTXT].UpdateScissor();
		m_vt.Update(m_vertex.buff, m_index.buff, m_vertex.tail, m_index.tail, GSUtil::GetPrimClass(PRIM->PRIM));

		// Texel coordinate rounding
		// Helps Manhunt (lights shining through objects).
		// Can help with some alignment issues when upscaling too, and is for both Software and Hardware renderers.
		// Sometimes hardware doesn't get affected, likely due to the difference in how GPU's handle textures (Persona minimap).
		if (PRIM->TME && (GSUtil::GetPrimClass(PRIM->PRIM) == GS_PRIM_CLASS::GS_SPRITE_CLASS || m_vt.m_eq.z))
		{
			if (!PRIM->FST) // STQ's
			{
				const bool is_sprite = GSUtil::GetPrimClass(PRIM->PRIM) == GS_PRIM_CLASS::GS_SPRITE_CLASS;
				// ST's have the lowest 9 bits (or greater depending on exponent difference) rounding down (from hardware tests).
				// The loop runs backwards so that, for sprites, the odd (second) vertex has already had
				// its Q rounded by the time the even (first) vertex copies it.
				for (int i = m_index.tail - 1; i >= 0; i--)
				{
					GSVertex* v = &m_vertex.buff[m_index.buff[i]];

					// Only Q on the second vertex is valid
					if (!(i & 1) && is_sprite)
						v->RGBAQ.Q = m_vertex.buff[m_index.buff[i + 1]].RGBAQ.Q;

					// Work on the raw IEEE-754 bit patterns.
					int T = std::bit_cast<int>(v->ST.T);
					int Q = std::bit_cast<int>(v->RGBAQ.Q);
					int S = std::bit_cast<int>(v->ST.S);
					// Biased 8-bit exponents.
					const int expS = (S >> 23) & 0xff;
					const int expT = (T >> 23) & 0xff;
					const int expQ = (Q >> 23) & 0xff;
					int max_exp = std::max(expS, expQ);

					// Clear the low bits of S; more bits are cleared the further S's exponent is below Q's.
					u32 mask = CalcMask(expS, max_exp);
					S &= ~mask;
					v->ST.S = std::bit_cast<float>(S);
					// Same for T.
					max_exp = std::max(expT, expQ);
					mask = CalcMask(expT, max_exp);
					T &= ~mask;
					v->ST.T = std::bit_cast<float>(T);
					// Q always loses its low 8 bits.
					Q &= ~0xff;

					// For sprites the even vertex already received the rounded Q copied above.
					if (!is_sprite || (i & 1))
						v->RGBAQ.Q = std::bit_cast<float>(Q);

					// Fold the rounded coordinates back into the vertex trace minimum.
					m_vt.m_min.t.x = std::min(m_vt.m_min.t.x, (v->ST.S / v->RGBAQ.Q) * (1 << m_context->TEX0.TW));
					m_vt.m_min.t.y = std::min(m_vt.m_min.t.y, (v->ST.T / v->RGBAQ.Q) * (1 << m_context->TEX0.TH));
				}
			}
		}

		// Skip draw if Z test is enabled, but set to fail all pixels.
		const bool skip_draw = (m_context->TEST.ZTE && m_context->TEST.ZTST == ZTST_NEVER);
		m_quad_check_valid = false;

		if (!skip_draw)
			Draw();

		// Performance counters are updated even when the draw was skipped.
		g_perfmon.Put(GSPerfMon::Draw, 1);
		g_perfmon.Put(GSPerfMon::Prim, m_index.tail / GSUtil::GetVertexCount(PRIM->PRIM));

		// Reset the queues.
		m_index.tail = 0;
		m_vertex.head = 0;

		if (unused > 0)
		{
			// Re-insert the saved vertices at the start of the vertex buffer.
			memcpy(m_vertex.buff, buff, sizeof(GSVertex) * unused);

			m_vertex.tail = unused;
			m_vertex.next = next > head ? next - head : 0;

			// If it's a Triangle fan the XY buffer needs to be updated to point to the correct head vert
			// Jak 3 shadows get spikey (with autoflush) if you don't.
			if (PRIM->PRIM == GS_TRIANGLEFAN)
			{
				for (u32 i = 0; i < unused; i++)
				{
					// The second 16-byte half of the vertex is loaded and its first lane expanded
					// into offset-relative XY. NOTE(review): exact GSVertex layout is not visible here.
					GSVector4i* RESTRICT vert_ptr = (GSVector4i*)&m_vertex.buff[i];
					GSVector4i v = vert_ptr[1];
					v = v.xxxx().u16to32().sub32(m_xyof);
					v = v.blend32<12>(v.sra32<4>());
					m_vertex.xy[i & 3] = v;
					m_vertex.xy_tail = unused;
				}
			}
		}
		else
		{
			m_vertex.tail = 0;
			m_vertex.next = 0;
		}
	}
}
// Returns the texel rectangle (x, y = min; z, w = max) addressed by the m_prev_env drawing
// context, derived from TEX0.TW/TH or from the CLAMP region depending on the wrap modes.
GSVector4i GSState::GetTEX0Rect()
{
	const GSDrawingContext& ctx = m_prev_env.CTXT[m_prev_env.PRIM.CTXT];
	GSVector4i rect = GSVector4i::zero();

	// Horizontal extent.
	switch (ctx.CLAMP.WMS)
	{
		case 0:
		case 1: // CLAMP/REPEAT
			rect.x = 0;
			rect.z = 1 << ctx.TEX0.TW;
			break;
		case 3: // REGION_REPEAT
			rect.x = ctx.CLAMP.MAXU;
			rect.z = ctx.CLAMP.MAXU | ctx.CLAMP.MINU;
			break;
		default: // 2 REGION_CLAMP
			rect.x = ctx.CLAMP.MINU;
			rect.z = ctx.CLAMP.MAXU;
			break;
	}

	// Vertical extent.
	switch (ctx.CLAMP.WMT)
	{
		case 0:
		case 1: // CLAMP/REPEAT
			rect.y = 0;
			rect.w = 1 << ctx.TEX0.TH;
			break;
		case 3: // REGION_REPEAT
			rect.y = ctx.CLAMP.MAXV;
			rect.w = ctx.CLAMP.MAXV | ctx.CLAMP.MINV;
			break;
		default: // 2 REGION_CLAMP
			rect.y = ctx.CLAMP.MINV;
			rect.w = ctx.CLAMP.MAXV;
			break;
	}

	return rect;
}
// Called before a transfer starts. If primitives are queued and the transfer touches memory
// the queued draw uses (its texture or mip levels, frame buffer or Z buffer), the queued draw
// is flushed first. For writes, the CLUT range covered by the destination is invalidated.
//
// req_write: the transfer writes local memory (destination side of BITBLTBUF/TRXPOS).
// req_read:  the transfer reads local memory (source side of BITBLTBUF/TRXPOS).
// Both are true for a local -> local move.
void GSState::CheckWriteOverlap(bool req_write, bool req_read)
{
	const int w = m_env.TRXREG.RRW;
	const int h = m_env.TRXREG.RRH;
	const GIFRegBITBLTBUF& blit = m_env.BITBLTBUF;
	const GSDrawingContext& prev_ctx = m_prev_env.CTXT[m_prev_env.PRIM.CTXT];
	const GSVector4i write_rect = GSVector4i(m_env.TRXPOS.DSAX, m_env.TRXPOS.DSAY, m_env.TRXPOS.DSAX + w, m_env.TRXPOS.DSAY + h);
	const u32 write_start_bp = GSLocalMemory::GetStartBlockAddress(blit.DBP, blit.DBW, blit.DPSM, write_rect);
	// End block is made exclusive (+1) and rounded up to the next page boundary.
	const u32 write_end_bp = ((GSLocalMemory::GetEndBlockAddress(blit.DBP, blit.DBW, blit.DPSM, write_rect) + 1) + (GS_BLOCKS_PER_PAGE - 1)) & ~(GS_BLOCKS_PER_PAGE - 1);
	GSVector4i tex_rect = m_prev_env.PRIM.TME ? GetTEX0Rect() : GSVector4i::zero();

	if (m_index.tail > 0)
	{
		// Only flush on a NEW transfer if a pending one is using the same address or overlap.
		// Check Fast & Furious (Hardware mode) and Assault Suits Valken (either renderer) and Tomb Raider - Angel of Darkness menu (TBP != DBP but overlaps).
		// Cartoon Network overwrites its own Z buffer in the middle of a draw.
		// Alias wraps its transfers, so be careful
		const GSVector4i read_rect = GSVector4i(m_env.TRXPOS.SSAX, m_env.TRXPOS.SSAY, m_env.TRXPOS.SSAX + w, m_env.TRXPOS.SSAY + h);

		if (req_write && m_prev_env.PRIM.TME)
		{
			// Tex rect could be invalid showing 1024x1024 when it isn't. If the frame is only 1 page wide, it's either a big strip or a single page draw.
			// This large texture causes misdetection of overlapping writes, causing our heuristics in the hardware renderer for future draws to be missing.
			// Either way if we check the queued up coordinates, it should give us a fair idea. (Cabela's Trophy Bucks)
			if (prev_ctx.FRAME.FBW == 1 && tex_rect.width() > (prev_ctx.TEX0.TBW * 64))
			{
				// Bounding box of the texel coordinates referenced by the queued vertices.
				GSVector4i tex_draw_rect = GSVector4i::zero();
				for (u32 i = 0; i < m_index.tail; i++)
				{
					const GSVertex* v = &m_vertex.buff[m_index.buff[i]];

					GSVector2i tex_coord;
					if (PRIM->FST)
					{
						// UV: drop the 4 fractional bits.
						tex_coord.x = v->U >> 4;
						tex_coord.y = v->V >> 4;
					}
					else
					{
						// STQ: normalised coordinates (capped at 1.0) scaled by the texture size.
						const float s = std::min((v->ST.S / v->RGBAQ.Q), 1.0f);
						const float t = std::min((v->ST.T / v->RGBAQ.Q), 1.0f);

						tex_coord.x = static_cast<int>(std::round((1 << m_context->TEX0.TW) * s));
						tex_coord.y = static_cast<int>(std::round((1 << m_context->TEX0.TH) * t));
					}

					if (i == 0)
					{
						tex_draw_rect.x = tex_coord.x;
						tex_draw_rect.y = tex_coord.y;
						tex_draw_rect.z = tex_coord.x;
						tex_draw_rect.w = tex_coord.y;
						continue;
					}

					tex_draw_rect.x = std::min(tex_draw_rect.x, tex_coord.x);
					tex_draw_rect.z = std::max(tex_draw_rect.z, tex_coord.x);
					tex_draw_rect.y = std::min(tex_draw_rect.y, tex_coord.y);
					tex_draw_rect.w = std::max(tex_draw_rect.w, tex_coord.y);
				}

				tex_rect = tex_rect.rintersect(tex_draw_rect);
			}

			// Base texture level.
			if (GSLocalMemory::HasOverlap(blit.DBP, blit.DBW, blit.DPSM, write_rect, prev_ctx.TEX0.TBP0, prev_ctx.TEX0.TBW, prev_ctx.TEX0.PSM, tex_rect))
			{
				Flush(GSFlushReason::UPLOADDIRTYTEX);
			}

			// Mip levels, only when the queued draw actually samples them.
			if (prev_ctx.TEX1.MXL > 0 && prev_ctx.TEX1.MMIN >= 2 && prev_ctx.TEX1.MMIN <= 5)
			{
				// Flushes if the write overlaps the given mip level; the texture rect is scaled
				// down by the level number.
				const auto flush_if_mip_overlaps = [&](u32 tbp, u32 tbw, int level) {
					const GSVector4i mip_rect(tex_rect.x >> level, tex_rect.y >> level, tex_rect.z >> level, tex_rect.w >> level);
					if (GSLocalMemory::HasOverlap(blit.DBP, blit.DBW, blit.DPSM, write_rect, tbp, tbw, prev_ctx.TEX0.PSM, mip_rect))
						Flush(GSFlushReason::UPLOADDIRTYTEX);
				};

				// Check from the highest enabled level down to level 1.
				switch (prev_ctx.TEX1.MXL)
				{
					case 6:
						flush_if_mip_overlaps(prev_ctx.MIPTBP2.TBP6, prev_ctx.MIPTBP2.TBW6, 6);
						[[fallthrough]];
					case 5:
						flush_if_mip_overlaps(prev_ctx.MIPTBP2.TBP5, prev_ctx.MIPTBP2.TBW5, 5);
						[[fallthrough]];
					case 4:
						flush_if_mip_overlaps(prev_ctx.MIPTBP2.TBP4, prev_ctx.MIPTBP2.TBW4, 4);
						[[fallthrough]];
					case 3:
						flush_if_mip_overlaps(prev_ctx.MIPTBP1.TBP3, prev_ctx.MIPTBP1.TBW3, 3);
						[[fallthrough]];
					case 2:
						flush_if_mip_overlaps(prev_ctx.MIPTBP1.TBP2, prev_ctx.MIPTBP1.TBW2, 2);
						[[fallthrough]];
					case 1:
						flush_if_mip_overlaps(prev_ctx.MIPTBP1.TBP1, prev_ctx.MIPTBP1.TBW1, 1);
						break;
				}
			}
		}

		// The frame buffer is in play unless the alpha test fails everything without touching
		// colour and/or the frame is fully masked; destination alpha testing always reads it.
		const u32 frame_mask = GSLocalMemory::m_psm[prev_ctx.FRAME.PSM].fmsk;
		const bool frame_required = (!(prev_ctx.TEST.ATE && prev_ctx.TEST.ATST == 0 && (prev_ctx.TEST.AFAIL == 2 || prev_ctx.TEST.AFAIL == 0)) && ((prev_ctx.FRAME.FBMSK & frame_mask) != frame_mask)) || prev_ctx.TEST.DATE;
		if (frame_required)
		{
			const GSFlushReason reason = req_write ? (req_read ? GSFlushReason::LOCALTOLOCALMOVE : GSFlushReason::UPLOADDIRTYFRAME) : GSFlushReason::DOWNLOADFIFO;
			// The write side is checked against the destination buffer, the read side against the source buffer.
			if ((req_write && (blit.DBP == prev_ctx.FRAME.Block() || GSLocalMemory::HasOverlap(blit.DBP, blit.DBW, blit.DPSM, write_rect, prev_ctx.FRAME.Block(), prev_ctx.FRAME.FBW, prev_ctx.FRAME.PSM, temp_draw_rect))) ||
				(req_read && (blit.SBP == prev_ctx.FRAME.Block() || GSLocalMemory::HasOverlap(blit.SBP, blit.SBW, blit.SPSM, read_rect, prev_ctx.FRAME.Block(), prev_ctx.FRAME.FBW, prev_ctx.FRAME.PSM, temp_draw_rect))))
				Flush(reason);
		}

		// The Z buffer is in play when it can be written or when a real depth comparison is enabled.
		const bool zbuf_required = (!(prev_ctx.TEST.ATE && prev_ctx.TEST.ATST == 0 && prev_ctx.TEST.AFAIL != 2) && !prev_ctx.ZBUF.ZMSK) || (prev_ctx.TEST.ZTE && prev_ctx.TEST.ZTST > ZTST_ALWAYS);
		if (zbuf_required)
		{
			const GSFlushReason reason = req_write ? (req_read ? GSFlushReason::LOCALTOLOCALMOVE : GSFlushReason::UPLOADDIRTYZBUF) : GSFlushReason::DOWNLOADFIFO;
			// The write side is checked against the destination buffer, the read side against the source buffer.
			if ((req_write && (blit.DBP == prev_ctx.ZBUF.Block() || GSLocalMemory::HasOverlap(blit.DBP, blit.DBW, blit.DPSM, write_rect, prev_ctx.ZBUF.Block(), prev_ctx.FRAME.FBW, prev_ctx.ZBUF.PSM, temp_draw_rect))) ||
				(req_read && (blit.SBP == prev_ctx.ZBUF.Block() || GSLocalMemory::HasOverlap(blit.SBP, blit.SBW, blit.SPSM, read_rect, prev_ctx.ZBUF.Block(), prev_ctx.FRAME.FBW, prev_ctx.ZBUF.PSM, temp_draw_rect))))
				Flush(reason);
		}
	}

	if (req_write)
	{
		// Invalidate the CLUT if it crosses paths.
		m_mem.m_clut.InvalidateRange(write_start_bp, write_end_bp);
	}
}
void GSState::Write(const u8* mem, int len)
{
if (m_env.TRXDIR.XDIR == 3)
return;
CheckWriteOverlap(true, false);
if (!m_tr.Update(m_tr.w, m_tr.h, GSLocalMemory::m_psm[m_tr.m_blit.DPSM].trbpp, len))
{
m_env.TRXDIR.XDIR = 3;
return;
}
GIFRegBITBLTBUF& blit = m_tr.m_blit;
const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[blit.DPSM];
if (m_tr.end == 0)
{
GSVector4i r;
r = m_tr.rect;
s_last_transfer_draw_n = s_n;
// Store the transfer for preloading new RT's.
if ((m_draw_transfers.size() > 0 && blit.DBP == m_draw_transfers.back().blit.DBP))
{
// Same BP, let's update the rect.
GSUploadQueue transfer = m_draw_transfers.back();
m_draw_transfers.pop_back();
transfer.rect = transfer.rect.runion(r);
transfer.draw = s_n;
transfer.zero_clear = false;
m_draw_transfers.push_back(transfer);
}
else
{
GSUploadQueue new_transfer = { blit, r, s_n, false };
m_draw_transfers.push_back(new_transfer);
}
GL_CACHE("Write! %u ... => 0x%x W:%d F:%s (DIR %d%d), dPos(%d %d) size(%d %d) draw %d", s_transfer_n,
blit.DBP, blit.DBW, GSUtil::GetPSMName(blit.DPSM),
m_tr.m_pos.DIRX, m_tr.m_pos.DIRY,
m_tr.x, m_tr.y, m_tr.w, m_tr.h, s_n);
if (len >= m_tr.total)
{
// received all data in one piece, no need to buffer it
InvalidateVideoMem(blit, r);
psm.wi(m_mem, m_tr.x, m_tr.y, mem, m_tr.total, blit, m_tr.m_pos, m_tr.m_reg);
m_tr.start = m_tr.end = m_tr.total;
g_perfmon.Put(GSPerfMon::Swizzle, len);
s_transfer_n++;
m_env.TRXDIR.XDIR = 3;
return;
}
}
memcpy(&m_tr.buff[m_tr.end], mem, len);
m_tr.end += len;
if (m_tr.end >= m_tr.total)
FlushWrite();
}
// Prepares a local -> host transfer: after the overlap check, the whole source image is read
// into m_tr.buff in one go. Does nothing if len is not positive, a transfer is already in
// progress (m_tr.total != 0) or transfers are deactivated (XDIR == 3).
// Note: `mem` is not used by this function.
void GSState::InitReadFIFO(u8* mem, int len)
{
	// No size, already a transfer in progress, or transfers deactivated.
	const bool nothing_to_do = (len <= 0 || m_tr.total != 0) || (m_env.TRXDIR.XDIR == 3);
	if (nothing_to_do)
		return;

	const int w = m_env.TRXREG.RRW;
	const int h = m_env.TRXREG.RRH;
	const u16 bpp = GSLocalMemory::m_psm[m_env.BITBLTBUF.SPSM].trbpp;

	CheckWriteOverlap(false, true);

	if (!m_tr.Update(w, h, bpp, len))
		return;

	const int sx = m_env.TRXPOS.SSAX;
	const int sy = m_env.TRXPOS.SSAY;
	const GSVector4i r(sx, sy, sx + w, sy + h);

	// Only invalidate while the transfer position is still at the source origin.
	const bool at_transfer_origin = (m_tr.x == sx && m_tr.y == sy);
	if (at_transfer_origin)
		InvalidateLocalMem(m_env.BITBLTBUF, r);

	// Read the image all in one go.
	m_mem.ReadImageX(m_tr.x, m_tr.y, m_tr.buff, m_tr.total, m_env.BITBLTBUF, m_env.TRXPOS, m_env.TRXREG);

	// Optional debug dump of the source buffer.
	if (GSConfig.SaveRT && GSConfig.ShouldDump(s_n, g_perfmon.GetFrame()))
	{
		const std::string dump_path(GetDrawDumpPath(
			"%05d_read_%05x_%d_%d_%d_%d_%d_%d.bmp",
			s_n, static_cast<int>(m_env.BITBLTBUF.SBP), static_cast<int>(m_env.BITBLTBUF.SBW), static_cast<int>(m_env.BITBLTBUF.SPSM),
			r.left, r.top, r.right, r.bottom));

		m_mem.SaveBMP(dump_path, m_env.BITBLTBUF.SBP, m_env.BITBLTBUF.SBW, m_env.BITBLTBUF.SPSM, r.right, r.bottom);
	}
}
// NOTE: called from outside MTGS
// Copies `len` bytes of the buffered local->host download (m_tr.buff, starting
// at m_tr.end) into `mem`, advancing m_tr.end. When the read position reaches
// m_tr.total the transfer is marked finished (XDIR = 3).
void GSState::Read(u8* mem, int len)
{
	if (len <= 0 || m_tr.total == 0)
		return;

	if (m_env.TRXDIR.XDIR == 3)
		return;

	const int w = m_env.TRXREG.RRW;
	const int h = m_env.TRXREG.RRH;
	const u16 bpp = GSLocalMemory::m_psm[m_env.BITBLTBUF.SPSM].trbpp;

	CheckWriteOverlap(false, true);

	if (!m_tr.Update(w, h, bpp, len))
		return;

	// NOTE(review): nothing runs between the snapshot and the comparison, so this
	// can only fire if s_n changes concurrently. Kept as-is; confirm whether it is
	// still wanted.
	const int draw = s_n;

	if (draw != s_n)
		DevCon.Warning("Warning! Possible incorrect data download");

	// If it wraps memory, we need to break it up so we don't read out of bounds.
	if ((m_tr.end + len) > m_mem.m_vmsize)
	{
		const int first_transfer = m_mem.m_vmsize - m_tr.end;
		const int second_transfer = len - first_transfer;

		memcpy(mem, &m_tr.buff[m_tr.end], first_transfer);

		// The remainder comes from the start of the buffer. Use &buff[0] rather than
		// &buff: the latter is the address of the member itself, which is only the
		// same thing when buff is an array and is wrong when it is a pointer.
		memcpy(&mem[first_transfer], &m_tr.buff[0], second_transfer);
		m_tr.end = second_transfer;
	}
	else
	{
		memcpy(mem, &m_tr.buff[m_tr.end], len);
		m_tr.end += len;
	}

	if (m_tr.end >= m_tr.total)
		m_env.TRXDIR.XDIR = 3;
}
// Performs a local->local transfer (copy inside GS memory) described by the
// current BITBLTBUF / TRXPOS / TRXREG registers.
// Steps: invalidate the source and destination rectangles, record the transfer
// in m_draw_transfers, copy the pixels with a format-specific per-pixel functor,
// and finally mark the transfer finished (XDIR = 3).
// Note that s_transfer_n is incremented even when no transfer is active.
void GSState::Move()
{
// ffxii uses this to move the top/bottom of the scrolling menus offscreen and then blends them back over the text to create a shading effect
// guitar hero copies the far end of the board to do a similar blend too
s_transfer_n++;
if (m_env.TRXDIR.XDIR == 3)
return;
int sx = m_env.TRXPOS.SSAX;
int sy = m_env.TRXPOS.SSAY;
int dx = m_env.TRXPOS.DSAX;
int dy = m_env.TRXPOS.DSAY;
const int w = m_env.TRXREG.RRW;
const int h = m_env.TRXREG.RRH;
GL_CACHE("Move! 0x%x W:%d F:%s => 0x%x W:%d F:%s (DIR %d%d), sPos(%d %d) dPos(%d %d) size(%d %d)",
m_env.BITBLTBUF.SBP, m_env.BITBLTBUF.SBW, GSUtil::GetPSMName(m_env.BITBLTBUF.SPSM),
m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW, GSUtil::GetPSMName(m_env.BITBLTBUF.DPSM),
m_env.TRXPOS.DIRX, m_env.TRXPOS.DIRY,
sx, sy, dx, dy, w, h);
// Source rectangle is about to be read, destination rectangle is about to be overwritten.
InvalidateLocalMem(m_env.BITBLTBUF, GSVector4i(sx, sy, sx + w, sy + h));
InvalidateVideoMem(m_env.BITBLTBUF, GSVector4i(dx, dy, dx + w, dy + h));
// DIRX / DIRY select a reversed walk: start from the far edge and step by -1.
int xinc = 1;
int yinc = 1;
if (m_env.TRXPOS.DIRX)
{
sx += w - 1;
dx += w - 1;
xinc = -1;
}
if (m_env.TRXPOS.DIRY)
{
sy += h - 1;
dy += h - 1;
yinc = -1;
}
const GSLocalMemory::psm_t& spsm = GSLocalMemory::m_psm[m_env.BITBLTBUF.SPSM];
const GSLocalMemory::psm_t& dpsm = GSLocalMemory::m_psm[m_env.BITBLTBUF.DPSM];
// TODO: unroll inner loops (width has special size requirement, must be multiples of 1 << n, depending on the format)
const int sbp = m_env.BITBLTBUF.SBP;
const int sbw = m_env.BITBLTBUF.SBW;
const int dbp = m_env.BITBLTBUF.DBP;
const int dbw = m_env.BITBLTBUF.DBW;
const GSOffset spo = m_mem.GetOffset(sbp, sbw, m_env.BITBLTBUF.SPSM);
const GSOffset dpo = m_mem.GetOffset(dbp, dbw, m_env.BITBLTBUF.DPSM);
// Destination rectangle taken from the registers (not affected by the DIRX/DIRY adjustments above).
GSVector4i r;
r.left = m_env.TRXPOS.DSAX;
r.top = m_env.TRXPOS.DSAY;
r.right = r.left + m_env.TRXREG.RRW;
r.bottom = r.top + m_env.TRXREG.RRH;
s_last_transfer_draw_n = s_n;
// Store the transfer for preloading new RT's.
if ((m_draw_transfers.size() > 0 && m_env.BITBLTBUF.DBP == m_draw_transfers.back().blit.DBP))
{
// Same BP, let's update the rect.
GSUploadQueue transfer = m_draw_transfers.back();
m_draw_transfers.pop_back();
transfer.rect = transfer.rect.runion(r);
transfer.draw = s_n;
transfer.zero_clear = false;
m_draw_transfers.push_back(transfer);
}
else
{
GSUploadQueue new_transfer = { m_env.BITBLTBUF, r, s_n, false };
m_draw_transfers.push_back(new_transfer);
}
// Generic copy loop. pxCopyFn(dst_offset, src_offset) moves a single pixel; the
// loop only decides the visiting order. X coordinates are wrapped with & 2047.
auto copy = [this, sbp, dbp, sx, sy, dx, dy, w, h, yinc, xinc](const GSOffset& dpo, const GSOffset& spo, auto&& pxCopyFn)
{
int _sy = sy, _dy = dy; // Faster with local copied variables, compiler optimizations are dumb
if (xinc > 0)
{
// Page dimensions are taken from the destination format.
const int page_width = GSLocalMemory::m_psm[m_env.BITBLTBUF.DPSM].pgs.x;
const int page_height = GSLocalMemory::m_psm[m_env.BITBLTBUF.DPSM].pgs.y;
const int xpage = sx & ~(page_width - 1);
const int ypage = _sy & ~(page_height - 1);
// Copying from itself to itself (rotating textures) used in Gitaroo Man stage 8
// What probably happens is because the copy is buffered, the source stays just ahead of the destination.
// No need to do all this if the copy source/destination don't intersect, however.
const bool intersect = !(GSVector4i(sx, sy, sx + w, sy + h).rintersect(GSVector4i(dx, dy, dx + w, dy + h)).rempty());
if (intersect && sbp == dbp && (((_sy < _dy) && ((ypage + page_height) > _dy)) || ((sx < dx) && ((xpage + page_width) > dx))))
{
// Overlapping copy within the same buffer where the source precedes the
// destination inside one page: walk backwards on the affected axis so
// source pixels are read before they are overwritten.
int starty = (yinc > 0) ? 0 : h-1;
int endy = (yinc > 0) ? h : -1;
int y_inc = yinc;
if (((_sy < _dy) && ((ypage + page_height) > _dy)) && yinc > 0)
{
_sy += h-1;
_dy += h-1;
starty = h-1;
endy = -1;
y_inc = -y_inc;
}
for (int y = starty; y != endy; y+= y_inc, _sy += y_inc, _dy += y_inc)
{
GSOffset::PAHelper s = spo.paMulti(0, _sy);
GSOffset::PAHelper d = dpo.paMulti(0, _dy);
if (((sx < dx) && ((xpage + page_width) > dx)))
{
// Horizontal overlap: copy right-to-left.
for (int x = w - 1; x >= 0; x--)
{
pxCopyFn(d.value((dx + x) & 2047), s.value((sx + x) & 2047));
}
}
else
{
for (int x = 0; x < w; x++)
{
pxCopyFn(d.value((dx + x) & 2047), s.value((sx + x) & 2047));
}
}
}
}
else
{
// No hazardous overlap: plain left-to-right copy, rows stepped by yinc.
for (int y = 0; y < h; y++, _sy += yinc, _dy += yinc)
{
GSOffset::PAHelper s = spo.paMulti(0, _sy);
GSOffset::PAHelper d = dpo.paMulti(0, _dy);
for (int x = 0; x < w; x++)
{
pxCopyFn(d.value((dx + x) & 2047), s.value((sx + x) & 2047));
}
}
}
}
else
{
// Reversed X direction: sx/dx already point at the right edge, so step down from there.
for (int y = 0; y < h; y++, _sy += yinc, _dy += yinc)
{
GSOffset::PAHelper s = spo.paMulti(0, _sy);
GSOffset::PAHelper d = dpo.paMulti(0, _dy);
for (int x = 0; x < w; x++)
{
pxCopyFn(d.value((dx - x) & 2047), s.value((sx - x) & 2047));
}
}
}
};
// Pick the per-pixel copy based on the transfer bit depth / pixel format.
if (spsm.trbpp == dpsm.trbpp && spsm.trbpp >= 16)
{
if (spsm.trbpp == 32)
{
u32* vm = m_mem.vm32();
copy(dpo.assertSizesMatch(GSLocalMemory::swizzle32), spo.assertSizesMatch(GSLocalMemory::swizzle32), [vm](u32 doff, u32 soff)
{
vm[doff] = vm[soff];
});
}
else if (spsm.trbpp == 24)
{
// 24-bit copy: keep the destination's top 8 bits, replace the low 24.
u32* vm = m_mem.vm32();
copy(dpo.assertSizesMatch(GSLocalMemory::swizzle32), spo.assertSizesMatch(GSLocalMemory::swizzle32), [vm](u32 doff, u32 soff)
{
vm[doff] = (vm[doff] & 0xff000000) | (vm[soff] & 0x00ffffff);
});
}
else // if (spsm.trbpp == 16)
{
u16* vm = m_mem.vm16();
copy(dpo.assertSizesMatch(GSLocalMemory::swizzle16), spo.assertSizesMatch(GSLocalMemory::swizzle16), [vm](u32 doff, u32 soff)
{
vm[doff] = vm[soff];
});
}
}
else if (m_env.BITBLTBUF.SPSM == PSMT8 && m_env.BITBLTBUF.DPSM == PSMT8)
{
u8* vm = m_mem.m_vm8;
copy(GSOffset::fromKnownPSM(dbp, dbw, PSMT8), GSOffset::fromKnownPSM(sbp, sbw, PSMT8), [vm](u32 doff, u32 soff)
{
vm[doff] = vm[soff];
});
}
else if (m_env.BITBLTBUF.SPSM == PSMT4 && m_env.BITBLTBUF.DPSM == PSMT4)
{
copy(GSOffset::fromKnownPSM(dbp, dbw, PSMT4), GSOffset::fromKnownPSM(sbp, sbw, PSMT4), [&](u32 doff, u32 soff)
{
m_mem.WritePixel4(doff, m_mem.ReadPixel4(soff));
});
}
else
{
// Fallback for every other format pair: go through the per-format read/write accessors.
copy(dpo, spo, [&](u32 doff, u32 soff)
{
(m_mem.*dpsm.wpa)(doff, (m_mem.*spsm.rpa)(soff));
});
}
// Transfer complete.
m_env.TRXDIR.XDIR = 3;
}
// Clears the GIF path state selected by `mask` and resets the transfer
// direction and Q:
//   bit 0 -> paths 0 and 3, bit 1 -> path 1, bit 2 -> path 2.
void GSState::SoftReset(u32 mask)
{
	const auto clear_path = [this](u32 index) {
		memset(&m_path[index], 0, sizeof(GIFPath));
	};

	if (mask & 1)
	{
		clear_path(0);
		clear_path(3);
	}

	if (mask & 2)
	{
		clear_path(1);
	}

	if (mask & 4)
	{
		clear_path(2);
	}

	m_env.TRXDIR.XDIR = 3; //-1 ; set it to invalid value
	m_q = 1.0f;
}
// Reads `size` 16-byte units of download data into `mem` and, when a dump is
// being recorded, logs the read there as well.
void GSState::ReadFIFO(u8* mem, int size)
{
	const int byte_count = size * 16;

	Read(mem, byte_count);

	if (m_dump)
	{
		m_dump->ReadFIFO(byte_count / 16);
	}
}
// Reads `qwc` 16-byte units of local memory into `mem` using the register
// values passed in by the caller rather than the current m_env registers.
// On the first call of a transfer (m_tr.start == 0) the whole image is read
// into m_tr.buff; each call then copies the next `len` bytes out of that
// buffer, wrapping around at m_vmsize.
void GSState::ReadLocalMemoryUnsync(u8* mem, int qwc, GIFRegBITBLTBUF BITBLTBUF, GIFRegTRXPOS TRXPOS, GIFRegTRXREG TRXREG)
{
	const int w = TRXREG.RRW;
	const int h = TRXREG.RRH;

	const u16 bpp = GSLocalMemory::m_psm[BITBLTBUF.SPSM].trbpp;

	// NOTE(review): `tb` is a local scratch buffer used only for validation and
	// the start position; the running state lives in m_tr. Confirm this split is
	// intentional.
	GSTransferBuffer tb;
	if (m_tr.end >= m_tr.total || m_tr.write == true)
		tb.Init(TRXPOS, TRXREG, BITBLTBUF, false);

	int len = qwc * 16;
	if (!tb.Update(w, h, bpp, len))
		return;

	if (m_tr.start == 0)
	{
		m_mem.ReadImageX(tb.x, tb.y, m_tr.buff, m_tr.total, BITBLTBUF, TRXPOS, TRXREG);
		m_tr.start += m_tr.total;
	}

	if ((m_tr.end + len) > m_mem.m_vmsize)
	{
		const int masked_end = m_tr.end & 0x3FFFFF; // 4mb.

		// Once m_tr.end has passed m_vmsize, the distance from masked_end to the end
		// of the buffer can be larger than len. Clamp it so the second part never
		// gets a negative size.
		const int first_transfer = std::min(len, static_cast<int>(m_mem.m_vmsize - masked_end));
		const int second_transfer = len - first_transfer;

		memcpy(mem, &m_tr.buff[masked_end], first_transfer);

		// Wrapped remainder comes from the start of the buffer. &buff[0] is correct
		// whether buff is an array or a pointer; &buff is not when it is a pointer.
		memcpy(&mem[first_transfer], &m_tr.buff[0], second_transfer);
		m_tr.end += len;
	}
	else
	{
		memcpy(mem, &m_tr.buff[m_tr.end], len);
		m_tr.end += len;
	}
}
// Intentionally empty in GSState: all three flags are ignored here.
// NOTE(review): presumably overridden by renderers that own a texture cache — confirm in the header.
void GSState::PurgeTextureCache(bool sources, bool targets, bool hash_cache)
{
}
// Intentionally empty in GSState. Freeze() calls this before serialising when
// GSConfig.UserHacks_ReadTCOnClose is set.
// NOTE(review): presumably overridden by renderers that own a texture cache — confirm in the header.
void GSState::ReadbackTextureCache()
{
}
// Explicit instantiations of Transfer<index> for each of the four m_path
// entries (index 0..3); the template itself is defined just below.
template void GSState::Transfer<0>(const u8* mem, u32 size);
template void GSState::Transfer<1>(const u8* mem, u32 size);
template void GSState::Transfer<2>(const u8* mem, u32 size);
template void GSState::Transfer<3>(const u8* mem, u32 size);
// Parses GIF data for path `index` and dispatches it to the register handlers
// or to the image transfer routines.
//   mem  - start of the data to parse.
//   size - amount of data in 16-byte units (the IMAGE case advances mem by
//          len * 16 while subtracting len from size).
// Parsing state (current tag, remaining loops, current register) is kept in
// m_path[index], so a packet may be split across several calls.
// For index 0 parsing stops as soon as a tag with EOP has been fully consumed.
template <int index>
void GSState::Transfer(const u8* mem, u32 size)
{
const u8* start = mem;
GIFPath& path = m_path[index];
while (size > 0)
{
if (path.nloop == 0)
{
// No packet in progress: the next unit is a GIFtag.
path.SetTag(mem);
mem += sizeof(GIFTag);
size--;
// eeuser 7.2.2. GIFtag:
// "... when NLOOP is 0, the GIF does not output anything, and values other than the EOP field are disregarded."
if (path.nloop > 0)
{
m_q = 1.0f;
// ASSERT(!(path.tag.PRE && path.tag.FLG == GIF_FLG_REGLIST)); // kingdom hearts
if (path.tag.PRE && path.tag.FLG == GIF_FLG_PACKED)
ApplyPRIM(path.tag.PRIM);
}
}
else
{
u32 total;
switch (path.tag.FLG)
{
case GIF_FLG_PACKED:
// get to the start of the loop
if (path.reg != 0)
{
// Finish the partially consumed loop left over from a previous call.
do
{
(this->*m_fpGIFPackedRegHandlers[path.GetReg()])((GIFPackedReg*)mem);
mem += sizeof(GIFPackedReg);
size--;
} while (path.StepReg() && size > 0 && path.reg != 0);
}
// all data available? usually is
total = path.nloop * path.nreg;
if (size >= total)
{
// Fast path: the rest of the packet is present, so process it in bulk
// using the handler selected by the precomputed path.type.
size -= total;
switch (path.type)
{
case GIFPath::TYPE_UNKNOWN:
{
u32 reg = 0;
do
{
(this->*m_fpGIFPackedRegHandlers[path.GetReg(reg++)])((GIFPackedReg*)mem);
mem += sizeof(GIFPackedReg);
reg = reg & ((int)(reg - path.nreg) >> 31); // resets reg back to 0 when it becomes equal to path.nreg
} while (--total > 0);
}
break;
case GIFPath::TYPE_ADONLY: // very common
do
{
(this->*m_fpGIFRegHandlers[((GIFPackedReg*)mem)->A_D.ADDR & 0x7F])(&((GIFPackedReg*)mem)->r);
mem += sizeof(GIFPackedReg);
} while (--total > 0);
break;
case GIFPath::TYPE_STQRGBAXYZF2: // majority of the vertices are formatted like this
(this->*m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2])((GIFPackedReg*)mem, total);
mem += total * sizeof(GIFPackedReg);
break;
case GIFPath::TYPE_STQRGBAXYZ2:
(this->*m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZ2])((GIFPackedReg*)mem, total);
mem += total * sizeof(GIFPackedReg);
break;
default:
ASSUME(0);
}
path.nloop = 0;
}
else
{
// Slow path: only part of the packet is available, step register by register.
do
{
(this->*m_fpGIFPackedRegHandlers[path.GetReg()])((GIFPackedReg*)mem);
mem += sizeof(GIFPackedReg);
size--;
} while (path.StepReg() && size > 0);
}
break;
case GIF_FLG_REGLIST:
// TODO: do it similar to packed operation
// size is temporarily counted in GIFReg-sized entries (two per unit) and
// converted back afterwards; an odd leftover entry is skipped as padding.
size *= 2;
do
{
(this->*m_fpGIFRegHandlers[path.GetReg() & 0x7F])((GIFReg*)mem);
mem += sizeof(GIFReg);
size--;
} while (path.StepReg() && size > 0);
if (size & 1)
mem += sizeof(GIFReg);
size /= 2;
break;
case GIF_FLG_IMAGE2:
// hmmm
// Fall through here fixes a crash in Wallace and Gromit Project Zoo
// and according to Pseudonym we shouldn't even land in this code. So hmm indeed. (rama)
case GIF_FLG_IMAGE:
{
// Consume at most the data that is available or that the tag still expects.
const int len = (int)std::min(size, path.nloop);
switch (m_env.TRXDIR.XDIR)
{
case 0:
Write(mem, len * 16);
break;
case 2:
Move();
break;
default: // 1 and 3
// 1 is invalid because downloads can only be done
// with a reverse fifo operation (vif)
// 3 is spec prohibited, it's behavior is not known
// lets do nothing for now
break;
}
mem += len * 16;
path.nloop -= len;
size -= len;
break;
}
default:
ASSUME(0);
}
}
if (index == 0)
{
if (path.tag.EOP && path.nloop == 0)
break;
}
}
// Record exactly the bytes that were consumed when a dump is active.
if (m_dump && mem > start)
m_dump->Transfer(index, start, mem - start);
if (index == 0)
{
if (size == 0 && path.nloop > 0)
{
// Hackfix for BIOS, which sends an incomplete packet when it does an XGKICK without
// having an EOP specified anywhere in VU1 memory. Needed until PCSX2 is fixed to
// handle it more properly (ie, without looping infinitely).
path.nloop = 0;
}
}
}
// Serialises `len` bytes (sizeof(T) by default) from `src` into the buffer at
// `dst` and advances `dst` past the bytes written.
template <class T>
static void WriteState(u8*& dst, T* src, size_t len = sizeof(T))
{
	u8* const out = dst;
	memcpy(out, src, len);
	dst = out + len;
}
template <class T>
static void ReadState(T* dst, u8*& src, size_t len = sizeof(T))
{
memcpy(dst, src, len);
src += len;
}
// Serialises the GS state into `fd`.
//   sizeonly == true : only stores the required buffer size in fd->size.
//   sizeonly == false: flushes pending work and writes the state into fd->data.
// Returns 0 on success, -1 when fd->data is null or fd->size is too small.
// The field order written here must stay in sync with Defrost().
int GSState::Freeze(freezeData* fd, bool sizeonly)
{
	const u32 version = STATE_VERSION;

	if (sizeonly)
	{
		fd->size = GetSaveStateSize(version);
		return 0;
	}

	if (!fd->data || fd->size < GetSaveStateSize(version))
		return -1;

	Flush(GSFlushReason::SAVESTATE);

	if (GSConfig.UserHacks_ReadTCOnClose)
		ReadbackTextureCache();

	u8* data = fd->data;

	WriteState(data, &version);
	WriteState(data, &m_env.PRIM);
	WriteState(data, &m_env.PRMODECONT);
	WriteState(data, &m_env.TEXCLUT);
	WriteState(data, &m_env.SCANMSK);
	WriteState(data, &m_env.TEXA);
	WriteState(data, &m_env.FOGCOL);
	WriteState(data, &m_env.DIMX);
	WriteState(data, &m_env.DTHE);
	WriteState(data, &m_env.COLCLAMP);
	WriteState(data, &m_env.PABE);
	WriteState(data, &m_env.BITBLTBUF);
	WriteState(data, &m_env.TRXDIR);
	WriteState(data, &m_env.TRXPOS);
	WriteState(data, &m_env.TRXREG);
	WriteState(data, &m_env.TRXREG); // obsolete

	for (int i = 0; i < 2; i++)
	{
		WriteState(data, &m_env.CTXT[i].XYOFFSET);
		WriteState(data, &m_env.CTXT[i].TEX0);
		WriteState(data, &m_env.CTXT[i].TEX1);
		WriteState(data, &m_env.CTXT[i].CLAMP);
		WriteState(data, &m_env.CTXT[i].MIPTBP1);
		WriteState(data, &m_env.CTXT[i].MIPTBP2);
		WriteState(data, &m_env.CTXT[i].SCISSOR);
		WriteState(data, &m_env.CTXT[i].ALPHA);
		WriteState(data, &m_env.CTXT[i].TEST);
		WriteState(data, &m_env.CTXT[i].FBA);
		WriteState(data, &m_env.CTXT[i].FRAME);
		WriteState(data, &m_env.CTXT[i].ZBUF);
	}

	WriteState(data, &m_v.RGBAQ);
	WriteState(data, &m_v.ST);
	WriteState(data, &m_v.UV);
	WriteState(data, &m_v.FOG);
	WriteState(data, &m_v.XYZ);

	// Obsolete slot: Defrost() skips it, but zero it so the save state does not
	// carry whatever bytes happened to be in the caller's buffer.
	memset(data, 0, sizeof(GIFReg));
	data += sizeof(GIFReg); // obsolete

	WriteState(data, &m_tr.x);
	WriteState(data, &m_tr.y);

	// Version 9 up.
	WriteState(data, &m_tr.w);
	WriteState(data, &m_tr.h);
	WriteState(data, &m_tr.m_blit);
	WriteState(data, &m_tr.m_pos);
	WriteState(data, &m_tr.m_reg);
	WriteState(data, &m_tr.rect);
	WriteState(data, &m_tr.total);
	WriteState(data, &m_tr.start);
	WriteState(data, &m_tr.end);
	WriteState(data, &m_tr.write);
	// End of version 9 changes.

	WriteState(data, m_mem.m_vm8, m_mem.m_vmsize);

	for (GIFPath& path : m_path)
	{
		// Fold the expanded per-path state back into the tag before saving:
		// each regs.U8 entry is packed as a 4-bit field, eight per 32-bit word.
		path.tag.NREG = path.nreg;
		path.tag.NLOOP = path.nloop;
		path.tag.REGS = 0;

		for (size_t j = 0; j < std::size(path.regs.U8); j++)
		{
			path.tag.U32[2 + (j >> 3)] |= path.regs.U8[j] << ((j & 7) << 2);
		}

		WriteState(data, &path.tag);
		WriteState(data, &path.reg);
	}

	WriteState(data, &m_q);

	return 0;
}
// Restores the GS state from a buffer produced by Freeze().
// Older layouts are handled by skipping fields that no longer exist
// (version <= 4 and <= 6) and by reconstructing the transfer state for
// versions before 9.
// Returns 0 on success, -1 when the buffer is missing, too small, or was
// written by a newer STATE_VERSION.
int GSState::Defrost(const freezeData* fd)
{
	// The version header is read unconditionally below, so the buffer must hold
	// at least that many bytes (this also rejects a size of zero).
	if (!fd || !fd->data || fd->size < static_cast<int>(sizeof(u32)))
		return -1;

	u8* data = fd->data;

	u32 version;
	ReadState(&version, data);

	if (fd->size < GetSaveStateSize(version))
		return -1;

	if (version > STATE_VERSION)
	{
		Console.Error("GS: Savestate version is incompatible. Load aborted.");
		return -1;
	}

	Flush(GSFlushReason::LOADSTATE);
	Reset(true);

	ReadState(&m_env.PRIM, data);

	if (version <= 6)
		data += sizeof(GIFRegPRMODE);

	ReadState(&m_env.PRMODECONT, data);
	ReadState(&m_env.TEXCLUT, data);
	ReadState(&m_env.SCANMSK, data);
	ReadState(&m_env.TEXA, data);
	ReadState(&m_env.FOGCOL, data);
	ReadState(&m_env.DIMX, data);
	ReadState(&m_env.DTHE, data);
	ReadState(&m_env.COLCLAMP, data);
	ReadState(&m_env.PABE, data);
	ReadState(&m_env.BITBLTBUF, data);
	ReadState(&m_env.TRXDIR, data);
	ReadState(&m_env.TRXPOS, data);
	ReadState(&m_env.TRXREG, data);
	ReadState(&m_env.TRXREG, data); // obsolete

	for (int i = 0; i < 2; i++)
	{
		ReadState(&m_env.CTXT[i].XYOFFSET, data);
		ReadState(&m_env.CTXT[i].TEX0, data);
		ReadState(&m_env.CTXT[i].TEX1, data);

		if (version <= 6)
			data += sizeof(GIFRegTEX2);

		ReadState(&m_env.CTXT[i].CLAMP, data);
		ReadState(&m_env.CTXT[i].MIPTBP1, data);
		ReadState(&m_env.CTXT[i].MIPTBP2, data);
		ReadState(&m_env.CTXT[i].SCISSOR, data);
		ReadState(&m_env.CTXT[i].ALPHA, data);
		ReadState(&m_env.CTXT[i].TEST, data);
		ReadState(&m_env.CTXT[i].FBA, data);
		ReadState(&m_env.CTXT[i].FRAME, data);
		ReadState(&m_env.CTXT[i].ZBUF, data);

		m_env.CTXT[i].XYOFFSET.OFX &= 0xffff;
		m_env.CTXT[i].XYOFFSET.OFY &= 0xffff;

		if (version <= 4)
			data += sizeof(u32) * 7; // skip
	}

	ReadState(&m_v.RGBAQ, data);
	ReadState(&m_v.ST, data);
	ReadState(&m_v.UV, data);
	ReadState(&m_v.FOG, data);
	ReadState(&m_v.XYZ, data);
	data += sizeof(GIFReg); // obsolete

	ReadState(&m_tr.x, data);
	ReadState(&m_tr.y, data);

	if (version >= 9)
	{
		ReadState(&m_tr.w, data);
		ReadState(&m_tr.h, data);
		ReadState(&m_tr.m_blit, data);
		ReadState(&m_tr.m_pos, data);
		ReadState(&m_tr.m_reg, data);
		ReadState(&m_tr.rect, data);
		ReadState(&m_tr.total, data);
		ReadState(&m_tr.start, data);
		ReadState(&m_tr.end, data);
		ReadState(&m_tr.write, data);
	}
	else
	{
		// Older states did not store the transfer buffer description; rebuild it
		// from the environment registers.
		m_tr.w = m_env.TRXREG.RRW;
		m_tr.h = m_env.TRXREG.RRH;
		m_tr.m_blit = m_env.BITBLTBUF;
		m_tr.m_pos = m_env.TRXPOS;
		m_tr.m_reg = m_env.TRXREG;

		// Assume the last transfer was a write (but nuke it).
		m_tr.rect = GSVector4i(m_env.TRXPOS.DSAX, m_env.TRXPOS.DSAY, m_env.TRXPOS.DSAX + m_tr.w, m_env.TRXPOS.DSAY + m_tr.h);
		m_tr.total = 0;
		m_tr.start = 0;
		m_tr.end = 0;
		m_tr.write = true;
	}

	ReadState(m_mem.m_vm8, data, m_mem.m_vmsize);

	for (GIFPath& path : m_path)
	{
		ReadState(&path.tag, data);
		ReadState(&path.reg, data);

		path.SetTag(&path.tag); // expand regs
	}

	ReadState(&m_q, data);

	// Re-derive everything that is computed from the registers just loaded.
	m_prev_env = m_env;

	PRIM = &m_env.PRIM;

	UpdateContext();

	UpdateVertexKick();

	for (u32 i = 0; i < 2; i++)
	{
		m_env.CTXT[i].UpdateScissor();

		m_env.CTXT[i].offset.fb = m_mem.GetOffset(m_env.CTXT[i].FRAME.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].FRAME.PSM);
		m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].ZBUF.PSM);
		m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF);
	}

	UpdateScissor();

	// Force CLUT to be reloaded.
	m_mem.m_clut.Reset();
	(PRIM->CTXT == 0) ? ApplyTEX0<0>(m_context->TEX0) : ApplyTEX0<1>(m_context->TEX0);

	g_perfmon.SetFrame(0);

	ResetPCRTC();

	return 0;
}
//
void GSState::UpdateContext()
{
	// Points m_context at the drawing context selected by PRIM->CTXT and
	// refreshes the cached scissor values; logs when the context changes.
	const bool context_changed = (&m_draw_env->CTXT[PRIM->CTXT] != m_context);

	if (context_changed)
	{
		GL_REG("Context Switch %d", PRIM->CTXT);
	}

	m_context = const_cast<GSDrawingContext*>(&m_draw_env->CTXT[PRIM->CTXT]);

	UpdateScissor();
}
// Caches the active context's scissor data in GSState members:
// cull min/max, the XY offset, and whether the scissor rectangle is invalid
// (any of its components greater than the matching zwzw component).
void GSState::UpdateScissor()
{
	const auto& scissor = m_context->scissor;

	m_scissor_cull_min = scissor.cull.xyxy();
	m_scissor_cull_max = scissor.cull.zwzw();
	m_xyof = scissor.xyof;

	const bool nothing_greater = scissor.in.gt32(scissor.in.zwzw()).allfalse();
	m_scissor_invalid = !nothing_greater;
}
// Re-points the XYZ-family register handlers (packed, A+D and the combined
// STQRGBAXYZ variants) at the implementations for the current primitive type.
void GSState::UpdateVertexKick()
{
	const u32 prim = PRIM->PRIM;

	// Handler rows for this primitive; column order is XYZF2, XYZF3, XYZ2, XYZ3.
	const auto& packed_xyz = m_fpGIFPackedRegHandlerXYZ[prim];
	const auto& reg_xyz = m_fpGIFRegHandlerXYZ[prim];

	m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = packed_xyz[0];
	m_fpGIFPackedRegHandlers[GIF_REG_XYZF3] = packed_xyz[1];
	m_fpGIFPackedRegHandlers[GIF_REG_XYZ2] = packed_xyz[2];
	m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = packed_xyz[3];

	m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2] = reg_xyz[0];
	m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3] = reg_xyz[1];
	m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = reg_xyz[2];
	m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = reg_xyz[3];

	m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2] = m_fpGIFPackedRegHandlerSTQRGBAXYZF2[prim];
	m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZ2] = m_fpGIFPackedRegHandlerSTQRGBAXYZ2[prim];
}
void GSState::GrowVertexBuffer()
{
const u32 maxcount = std::max<u32>(m_vertex.maxcount * 3 / 2, 10000);
const u32 old_vertex_size = sizeof(GSVertex) * m_vertex.tail;
const u32 new_vertex_size = sizeof(GSVertex) * maxcount;
const u32 old_index_size = sizeof(u16) * m_index.tail;
const u32 new_index_size = sizeof(u16) * maxcount * 6; // Worst case index list is a list of points with vs expansion, 6 indices per point
// Structure describing buffers to reallocate
struct AllocDesc
{
void** pbuff;
u32 old_size;
u32 new_size;
};
const std::array<AllocDesc, 5> alloc_desc = {{
{reinterpret_cast<void**>(&m_vertex.buff), old_vertex_size, new_vertex_size},
// discard contents of buff_copy by setting old_size = 0
{reinterpret_cast<void**>(&m_vertex.buff_copy), 0, new_vertex_size},
{reinterpret_cast<void**>(&m_draw_vertex.buff), old_vertex_size, new_vertex_size},
{reinterpret_cast<void**>(&m_index.buff), old_index_size, new_index_size},
{reinterpret_cast<void**>(&m_draw_index.buff), old_index_size, new_index_size}
}};
// For logging
u32 total_size = 0;
for (const auto& desc : alloc_desc)
total_size += desc.new_size;
// Reallocate each of the needed buffers
for (const auto [pbuff, old_size, new_size] : alloc_desc)
{
void* new_buff = _aligned_malloc(new_size, 32);
if (!new_buff)
{
Console.Error("GS: failed to allocate %zu bytes for vertices and indices.", total_size);
pxFailRel("Memory allocation failed");
}
if (*pbuff)
{
if (old_size)
{
std::memcpy(new_buff, *pbuff, old_size);
}
_aligned_free(*pbuff);
}
*pbuff = new_buff;
}
m_vertex.maxcount = maxcount - 3; // -3 to have some space at the end of the buffer before DrawingKick can grow it
}
// Heuristically decides whether the indexed triangles in the current draw form
// quads (pairs of triangles, six indices per quad).
//   shuffle_check - when true, the cached result is neither used nor stored,
//                   and the X of each new triangle is shifted by 8 << 4 before
//                   being compared against the previous triangle.
// Side effects: updates m_are_quads and m_quad_check_valid, and may set
// m_prim_overlap to PRIM_OVERLAP_NO in the single-quad triangle-strip case.
// Every early `return false` leaves m_are_quads at false.
bool GSState::TrianglesAreQuads(bool shuffle_check)
{
// If this is a quad, there should only be two distinct values for both X and Y, which
// also happen to be the minimum/maximum bounds of the primitive.
if (!shuffle_check && m_quad_check_valid)
return m_are_quads;
const GSVertex* const v = m_vertex.buff;
m_are_quads = false;
m_quad_check_valid = !shuffle_check;
for (u32 idx = 0; idx < m_index.tail; idx += 6)
{
const u16* const i = m_index.buff + idx;
// Make sure the next set of triangles matches an edge of the previous triangle.
if (idx > 0)
{
const u16* const prev_tri= m_index.buff + (idx - 3);
GIFRegXYZ new_verts[3] = {v[i[0]].XYZ, v[i[1]].XYZ, v[i[2]].XYZ};
if (shuffle_check)
{
new_verts[0].X -= 8 << 4;
new_verts[1].X -= 8 << 4;
new_verts[2].X -= 8 << 4;
}
// Count how many of the three new vertices coincide with a vertex of the
// previous triangle; exactly two means the triangles share an edge.
u32 match_vert_count = 0;
if (!(new_verts[0] != m_vertex.buff[prev_tri[0]].XYZ && new_verts[0] != m_vertex.buff[prev_tri[1]].XYZ && new_verts[0] != m_vertex.buff[prev_tri[2]].XYZ))
match_vert_count++;
if (!(new_verts[1] != m_vertex.buff[prev_tri[0]].XYZ && new_verts[1] != m_vertex.buff[prev_tri[1]].XYZ && new_verts[1] != m_vertex.buff[prev_tri[2]].XYZ))
match_vert_count++;
if (!(new_verts[2] != m_vertex.buff[prev_tri[0]].XYZ && new_verts[2] != m_vertex.buff[prev_tri[1]].XYZ && new_verts[2] != m_vertex.buff[prev_tri[2]].XYZ))
match_vert_count++;
if (match_vert_count != 2)
return false;
}
// Degenerate triangles should've been culled already, so we can check indices.
// This doesn't really make much sense when it's a triangle strip as it will always have 1 extra vert, so check for distinct values for them.
if (PRIM->PRIM != GS_TRIANGLESTRIP)
{
// Second triangle must introduce exactly one index not used by the first.
u32 extra_verts = 0;
for (u32 j = 3; j < 6; j++)
{
const u16 tri2_idx = i[j];
if (tri2_idx != i[0] && tri2_idx != i[1] && tri2_idx != i[2])
extra_verts++;
}
if (extra_verts == 1)
continue;
}
else if (m_index.tail == 6)
{
// Single pair of strip triangles: compare positions rather than indices.
// NOTE(review): the loop variable `i` below shadows the index pointer `i`
// declared above, and the `break` only leaves the inner loop.
bool shared_vert_found = false;
for (int i = 0; i < 3; i++)
{
for (int j = 3; j < 6; j++)
if (m_vertex.buff[m_index.buff[i]].XYZ.X == m_vertex.buff[m_index.buff[j]].XYZ.X &&
m_vertex.buff[m_index.buff[i]].XYZ.Y == m_vertex.buff[m_index.buff[j]].XYZ.Y)
{
shared_vert_found = true;
break;
}
}
// At least one vert should be shared across otherwise it's 2 separate triangles (false positive from Tales of Destiny).
if (!shared_vert_found)
return false;
const int first_X = m_vertex.buff[m_index.buff[0]].XYZ.X;
const int first_Y = m_vertex.buff[m_index.buff[0]].XYZ.Y;
const int second_X = m_vertex.buff[m_index.buff[1]].XYZ.X;
const int second_Y = m_vertex.buff[m_index.buff[1]].XYZ.Y;
const int third_X = m_vertex.buff[m_index.buff[2]].XYZ.X;
const int third_Y = m_vertex.buff[m_index.buff[2]].XYZ.Y;
const int new_X = m_vertex.buff[m_index.buff[5]].XYZ.X;
const int new_Y = m_vertex.buff[m_index.buff[5]].XYZ.Y;
// Midpoint of the edge between the second and third vertices.
const int middle_Y = (second_Y >= third_Y) ? (third_Y + ((second_Y - third_Y) / 2)) : (second_Y + ((third_Y - second_Y) / 2));
const int middle_X = (second_X >= third_X) ? (third_X + ((second_X - third_X) / 2)) : (second_X + ((third_X - second_X) / 2));
const bool first_lt_X = first_X <= middle_X;
const bool first_lt_Y = first_Y <= middle_Y;
const bool new_lt_X = new_X <= middle_X;
const bool new_lt_Y = new_Y <= middle_Y;
// Check if verts are on the same side. Not totally accurate, but should be good enough.
if (first_lt_X == new_lt_X && new_lt_Y == first_lt_Y)
return false;
m_prim_overlap = PRIM_OVERLAP_NO;
break;
}
// As a fallback, they might've used different vertices with a tri list, not strip.
// Note that this won't work unless the quad is axis-aligned.
// NOTE(review): the second slot of each array starts at 0, so a coordinate of
// 0 compares equal to it before a second distinct value has been recorded —
// confirm whether that is acceptable.
u16 distinct_x_values[2] = {v[i[0]].XYZ.X};
u16 distinct_y_values[2] = {v[i[0]].XYZ.Y};
u32 num_distinct_x_values = 1, num_distinct_y_values = 1;
for (u32 j = 1; j < 6; j++)
{
const GSVertex& jv = v[i[j]];
if (jv.XYZ.X != distinct_x_values[0] && jv.XYZ.X != distinct_x_values[1])
{
if (num_distinct_x_values > 1)
return false;
distinct_x_values[num_distinct_x_values++] = jv.XYZ.X;
}
if (jv.XYZ.Y != distinct_y_values[0] && jv.XYZ.Y != distinct_y_values[1])
{
if (num_distinct_y_values > 1)
return false;
distinct_y_values[num_distinct_y_values++] = jv.XYZ.Y;
}
}
}
m_are_quads = true;
return true;
}
// Estimates whether the primitives of the current draw overlap each other.
// Returns PRIM_OVERLAP_NO for fewer than four vertices and for a single quad
// made of two triangles, PRIM_OVERLAP_UNKNOW for other non-sprite draws, and
// for sprites the result of a bounding-box sweep.
// For sprites it also rebuilds m_drawlist: each entry is the number of
// consecutive sprites that were found not to overlap the ones before them.
GSState::PRIM_OVERLAP GSState::PrimitiveOverlap()
{
// Either 1 triangle or 1 line or 3 POINTs
// It is bad for the POINTs but low probability that they overlap
if (m_vertex.next < 4)
return PRIM_OVERLAP_NO;
if (m_vt.m_primclass == GS_TRIANGLE_CLASS)
return (m_index.tail == 6 && TrianglesAreQuads()) ? PRIM_OVERLAP_NO : PRIM_OVERLAP_UNKNOW;
else if (m_vt.m_primclass != GS_SPRITE_CLASS)
return PRIM_OVERLAP_UNKNOW; // maybe, maybe not
// Check intersection of sprite primitive only
const u32 count = m_vertex.next;
PRIM_OVERLAP overlap = PRIM_OVERLAP_NO;
const GSVertex* v = m_vertex.buff;
m_drawlist.clear();
u32 i = 0;
// Sprites are vertex pairs (v[i], v[i + 1]).
// NOTE(review): assumes count is even; v[j + 1] is read without a separate bounds check.
while (i < count)
{
// In order to speed up comparison a bounding-box is accumulated. It removes a
// loop so code is much faster (check game virtua fighter). Besides it allow to check
// properly the Y order.
// .x = min(v[i].XYZ.X, v[i+1].XYZ.X)
// .y = min(v[i].XYZ.Y, v[i+1].XYZ.Y)
// .z = max(v[i].XYZ.X, v[i+1].XYZ.X)
// .w = max(v[i].XYZ.Y, v[i+1].XYZ.Y)
GSVector4i all = GSVector4i(v[i].m[1]).upl16(GSVector4i(v[i + 1].m[1])).upl16().xzyw();
all = all.xyxy().blend(all.zwzw(), all > all.zwxy());
u32 j = i + 2;
while (j < count)
{
GSVector4i sprite = GSVector4i(v[j].m[1]).upl16(GSVector4i(v[j + 1].m[1])).upl16().xzyw();
sprite = sprite.xyxy().blend(sprite.zwzw(), sprite > sprite.zwxy());
// Be sure to get vertex in good order, otherwise .r* function doesn't
// work as expected.
pxAssert(sprite.x <= sprite.z);
pxAssert(sprite.y <= sprite.w);
pxAssert(all.x <= all.z);
pxAssert(all.y <= all.w);
if (all.rintersect(sprite).rempty())
{
// No overlap with anything accumulated so far: grow the bounding box.
all = all.runion(sprite);
}
else
{
// Overlap found: close the current group here; the outer loop starts a
// new group at sprite j.
overlap = PRIM_OVERLAP_YES;
break;
}
j += 2;
}
m_drawlist.push_back((j - i) >> 1); // Sprite count
i = j;
}
#if 0
// Old algo: less constraint but O(n^2) instead of O(n) as above
// You have no guarantee on the sprite order, first vertex can be either top-left or bottom-left
// There is a high probability that the draw call will uses same ordering for all vertices.
// In order to keep a small performance impact only the first sprite will be checked
//
// Some safe-guard will be added in the outer-loop to avoid corruption with a limited perf impact
if (v[1].XYZ.Y < v[0].XYZ.Y) {
// First vertex is Top-Left
for (u32 i = 0; i < count; i += 2) {
if (v[i + 1].XYZ.Y > v[i].XYZ.Y) {
return PRIM_OVERLAP_UNKNOW;
}
GSVector4i vi(v[i].XYZ.X, v[i + 1].XYZ.Y, v[i + 1].XYZ.X, v[i].XYZ.Y);
for (u32 j = i + 2; j < count; j += 2) {
GSVector4i vj(v[j].XYZ.X, v[j + 1].XYZ.Y, v[j + 1].XYZ.X, v[j].XYZ.Y);
GSVector4i inter = vi.rintersect(vj);
if (!inter.rempty()) {
return PRIM_OVERLAP_YES;
}
}
}
}
else {
// First vertex is Bottom-Left
for (u32 i = 0; i < count; i += 2) {
if (v[i + 1].XYZ.Y < v[i].XYZ.Y) {
return PRIM_OVERLAP_UNKNOW;
}
GSVector4i vi(v[i].XYZ.X, v[i].XYZ.Y, v[i + 1].XYZ.X, v[i + 1].XYZ.Y);
for (u32 j = i + 2; j < count; j += 2) {
GSVector4i vj(v[j].XYZ.X, v[j].XYZ.Y, v[j + 1].XYZ.X, v[j + 1].XYZ.Y);
GSVector4i inter = vi.rintersect(vj);
if (!inter.rempty()) {
return PRIM_OVERLAP_YES;
}
}
}
}
#endif
// fprintf(stderr, "%d: Yes, code can be optimized (draw of %d vertices)\n", s_n, count);
return overlap;
}
// Heuristically decides whether a list of sprites (vertex pairs) tiles an area
// without gaps. Three layouts are recognised, chosen from the size of the
// first sprite:
//   1. width matches m_r_no_scissor.z  -> full-width strips stacked vertically
//   2. height matches m_r_no_scissor.w -> full-height strips placed side by side
//   3. height matches the frame format's page height -> rows of small sprites
// Returns false when none of the layouts apply or a gap is detected.
// Side effect: sets m_prim_overlap to PRIM_OVERLAP_NO when layout 3 succeeds.
// NOTE(review): the (value + 8) >> 4 rounding suggests coordinates carry 4
// fractional bits, and the "< 16" tolerances would then be one pixel — confirm.
bool GSState::SpriteDrawWithoutGaps()
{
// Check that the height matches. Xenosaga 3 draws a letterbox around
// the FMV with a sprite at the top and bottom of the framebuffer.
const GSVertex* v = &m_vertex.buff[0];
const int first_dpY = v[1].XYZ.Y - v[0].XYZ.Y;
const int first_dpX = v[1].XYZ.X - v[0].XYZ.X;
// Horizontal Match.
if (((first_dpX + 8) >> 4) == m_r_no_scissor.z)
{
// Borrowed from MergeSprite() modified to calculate heights.
for (u32 i = 2; i < m_vertex.next; i += 2)
{
// Each sprite must have the same height as the first and start where the previous one ended.
const int last_pY = v[i - 1].XYZ.Y;
const int dpY = v[i + 1].XYZ.Y - v[i].XYZ.Y;
if (std::abs(dpY - first_dpY) >= 16 || std::abs(static_cast<int>(v[i].XYZ.Y) - last_pY) >= 16)
return false;
}
return true;
}
// Vertical Match.
if (((first_dpY + 8) >> 4) == m_r_no_scissor.w)
{
// Borrowed from MergeSprite().
const int offset_X = m_context->XYOFFSET.OFX;
for (u32 i = 2; i < m_vertex.next; i += 2)
{
const int last_pX = v[i - 1].XYZ.X;
const int this_start_X = v[i].XYZ.X;
const int last_start_X = v[i - 2].XYZ.X;
const int dpX = v[i + 1].XYZ.X - v[i].XYZ.X;
if (this_start_X < last_start_X)
{
// This sprite starts to the left of the previous one: it must begin at the
// X offset and be as wide as the distance from the offset to the previous start.
const int prev_X = last_start_X - offset_X;
if (std::abs(dpX - prev_X) >= 16 || std::abs(this_start_X - offset_X) >= 16)
return false;
}
else
{
// Left-to-right order: heights must match (the last sprite is exempt) and
// the sprite must start where the previous one ended.
const int dpY = v[i + 1].XYZ.Y - v[i].XYZ.Y;
if ((std::abs(dpY - first_dpY) >= 16 && (i + 2) < m_vertex.next) || std::abs(this_start_X - last_pX) >= 16)
return false;
}
}
return true;
}
// Assume it's small sprites. NFSMW and a few other games draw 32x32 sprites in rows to fill the screen.
if (((first_dpY + 8) >> 4) == GSLocalMemory::m_psm[m_context->FRAME.PSM].pgs.y)
{
int lastXEdge = std::max(v[1].XYZ.X, v[0].XYZ.X);
int lastYEdge = std::max(v[1].XYZ.Y, v[0].XYZ.Y);
for (u32 i = 2; i < m_vertex.next; i += 2)
{
const int dpY = v[i + 1].XYZ.Y - v[i].XYZ.Y;
if (first_dpY != dpY)
return false;
const int newYStart = std::min(v[i + 1].XYZ.Y, v[i].XYZ.Y);
const int newXEdge = std::max(v[i + 1].XYZ.X, v[i].XYZ.X);
if (lastYEdge != newYStart)
{
// Not directly below the previous sprite: it must restart at the Y offset
// and begin at the previous sprite's far X edge.
if (newYStart != static_cast<int>(m_context->XYOFFSET.OFY))
return false;
const int newXStart = std::min(v[i + 1].XYZ.X, v[i].XYZ.X);
if (newXStart != lastXEdge)
return false;
}
else
{
// Directly below the previous sprite: same width and same far X edge required.
const int dpX = v[i + 1].XYZ.X - v[i].XYZ.X;
if (first_dpX != dpX || lastXEdge != newXEdge)
return false;
}
lastXEdge = newXEdge;
lastYEdge = std::max(v[i + 1].XYZ.Y, v[i].XYZ.Y);
}
m_prim_overlap = PRIM_OVERLAP_NO;
return true;
}
return false;
}
// Classifies the current draw into m_primitive_covers_without_gaps:
// FullCover, SpriteNoGaps or GapsFound. The starting value is FullCover
// unless the low mask bits of (m_r == 0) show the draw rectangle is offset,
// in which case it starts as GapsFound.
void GSState::CalculatePrimitiveCoversWithoutGaps()
{
	// Draw shouldn't be offset.
	const bool not_offset = ((m_r.eq32(GSVector4i::zero())).mask() & 0xff) == 0xff;
	m_primitive_covers_without_gaps = not_offset ? FullCover : GapsFound;

	if (m_vt.m_primclass == GS_POINT_CLASS)
	{
		// More than one point is always treated as having gaps.
		if (!(m_vertex.next < 2))
			m_primitive_covers_without_gaps = GapsFound;
		return;
	}

	if (m_vt.m_primclass == GS_TRIANGLE_CLASS)
	{
		// Only a single quad, or a whole number of quads on an un-offset draw,
		// is worth checking; TrianglesAreQuads() is skipped otherwise.
		const bool quad_candidate = (m_index.tail == 6) || ((m_index.tail % 6) == 0 && m_primitive_covers_without_gaps == FullCover);
		if (!(quad_candidate && TrianglesAreQuads()))
			m_primitive_covers_without_gaps = GapsFound;
		return;
	}

	if (m_vt.m_primclass != GS_SPRITE_CLASS)
	{
		m_primitive_covers_without_gaps = GapsFound;
		return;
	}

	// Simple case: one sprite.
	if (m_primitive_covers_without_gaps != GapsFound && m_index.tail == 2)
		return;

	if (!SpriteDrawWithoutGaps())
	{
		m_primitive_covers_without_gaps = GapsFound;
		return;
	}

	// Gap-free sprites on an offset draw are reported as SpriteNoGaps;
	// otherwise the earlier value is kept.
	if (m_primitive_covers_without_gaps == GapsFound)
		m_primitive_covers_without_gaps = SpriteNoGaps;
}
// Returns true when the primitive being kicked reads, as its texture, memory that the current
// FRAME or Z buffer is writing, i.e. the draw is a candidate for auto flushing.
__forceinline bool GSState::IsAutoFlushDraw(u32 prim)
{
	// Untextured draws read nothing back.
	if (!PRIM->TME)
		return false;

	// The "sprites only" level ignores every other primitive type.
	if (GSConfig.UserHacks_AutoFlush == GSHWAutoFlushLevel::SpritesOnly && prim != GS_SPRITE)
		return false;

	const GSLocalMemory::psm_t& frame_psm = GSLocalMemory::m_psm[m_context->FRAME.PSM];
	const GSLocalMemory::psm_t& tex_psm = GSLocalMemory::m_psm[m_context->TEX0.PSM];

	// Not using the same channels.
	const auto tex_channels = GSUtil::GetChannelMask(m_context->TEX0.PSM);
	const auto frame_channels = GSUtil::GetChannelMask(m_context->FRAME.PSM, m_context->FRAME.FBMSK | ~(frame_psm.fmsk));
	if (!(tex_channels & frame_channels))
		return false;

	// Try to detect shuffles, because these will not autoflush, they by design clash.
	if (frame_psm.bpp == 16 && tex_psm.bpp == 16)
	{
		// Pretty confident here...
		const GSVertex* queued = &m_vertex.buff[0];
		const bool same_uv_offset =
			std::abs(queued[m_index.buff[0]].U - queued[m_index.buff[0]].XYZ.X) == std::abs(m_v.U - m_v.XYZ.X);

		// 256 in 12.4 fixed point: less than or equal to 16 pixels apart.
		if (same_uv_offset && std::abs(queued[m_index.buff[1]].XYZ.X - queued[m_index.buff[0]].XYZ.X) <= 256)
			return false;
	}

	const u32 frame_mask = frame_psm.fmsk;
	const bool tex_is_frame = m_context->FRAME.Block() == m_context->TEX0.TBP0;
	const bool frame_hit = tex_is_frame &&
						   !(m_context->TEST.ATE && m_context->TEST.ATST == 0 && m_context->TEST.AFAIL == 2) &&
						   ((m_context->FRAME.FBMSK & frame_mask) != frame_mask);

	// There's a strange behaviour we need to test on a PS2 here, if the FRAME is a Z format, like Powerdrome
	// something swaps over, and it seems Alpha Fail of "FB Only" writes to the Z.. it's odd.
	const bool z_needed = !(m_context->TEST.ATE && m_context->TEST.ATST == 0 && m_context->TEST.AFAIL != 2) &&
						  !m_context->ZBUF.ZMSK;
	const bool zbuf_hit = (m_context->ZBUF.Block() == m_context->TEX0.TBP0) && z_needed;

	// Compare the texture against whichever target was hit, preferring the frame buffer.
	const u32 frame_z_psm = frame_hit ? m_context->FRAME.PSM : m_context->ZBUF.PSM;
	const u32 frame_z_bp = frame_hit ? m_context->FRAME.Block() : m_context->ZBUF.Block();

	return (frame_hit || zbuf_hit) &&
		   GSUtil::HasSharedBits(frame_z_bp, frame_z_psm, m_context->TEX0.TBP0, m_context->TEX0.PSM);
}
// Number of indices (and therefore vertices) that make up one primitive of the given GS primitive type.
// Returns 0 for values that are not a known primitive type.
static constexpr u32 NumIndicesForPrim(u32 prim)
{
	if (prim == GS_POINTLIST || prim == GS_INVALID)
		return 1;

	if (prim == GS_LINELIST || prim == GS_SPRITE || prim == GS_LINESTRIP)
		return 2;

	if (prim == GS_TRIANGLELIST || prim == GS_TRIANGLESTRIP || prim == GS_TRIANGLEFAN)
		return 3;

	return 0;
}
// Vertex count at which a batch of the given primitive type is flushed, derived from the u16 index range.
static constexpr u32 MaxVerticesForPrim(u32 prim)
{
	constexpr u32 index_limit = std::numeric_limits<u16>::max();

	// Points (and invalid prims): four indices per 1 vertex.
	// Lines: indices are shifted left by 2 to form quads.
	if (prim == GS_POINTLIST || prim == GS_INVALID || prim == GS_LINELIST || prim == GS_LINESTRIP)
		return (index_limit / 4) - 4;

	// Four indices per two vertices.
	if (prim == GS_SPRITE)
		return (index_limit / 2) - 2;

	// Triangle list/strip/fan, and anything unrecognised.
	return index_limit - 3;
}
// Invalidates the CLUT over the block range touched by the pending draw (temp_draw_rect in the previous
// environment's frame buffer) when that draw writes the frame buffer in a format as wide as the CLUT's.
__forceinline void GSState::CheckCLUTValidity(u32 prim)
{
	// Bit 1 of the CLUT invalid state is already set, nothing more to do.
	if (m_mem.m_clut.IsInvalid() & 2)
		return;

	const u32 indices_per_prim = NumIndicesForPrim(prim);
	const GSDrawingContext& prev_ctx = m_prev_env.CTXT[m_prev_env.PRIM.CTXT];

	// Something must be queued, or the primitive must be one vertex short of complete.
	const bool draw_pending = m_index.tail > 0 || (m_vertex.tail == indices_per_prim - 1);
	if (!draw_pending)
		return;

	// Skip draws which are sampling a paletted texture.
	if (GSLocalMemory::m_psm[prev_ctx.TEX0.PSM].pal != 0 && m_prev_env.PRIM.TME)
		return;

	const GSLocalMemory::psm_t& frame_psm = GSLocalMemory::m_psm[prev_ctx.FRAME.PSM];

	// The frame buffer isn't needed if the alpha test setup or the write mask stops every colour write.
	const bool alpha_test_skips_frame = prev_ctx.TEST.ATE && prev_ctx.TEST.ATST == 0 && prev_ctx.TEST.AFAIL == 2;
	const bool fully_masked = (prev_ctx.FRAME.FBMSK & frame_psm.fmsk) == frame_psm.fmsk;
	if (alpha_test_skips_frame || fully_masked)
		return;

	// Only frame formats with the same bit depth as the CLUT format are considered.
	if (GSLocalMemory::m_psm[m_mem.m_clut.GetCLUTCPSM()].bpp != frame_psm.bpp)
		return;

	const u32 first_bp = frame_psm.info.bn(temp_draw_rect.x, temp_draw_rect.y, prev_ctx.FRAME.Block(), prev_ctx.FRAME.FBW);

	// If it's a point, then we only have one coord, so the address for start and end will be the same,
	// which is bad for the end calculation; otherwise calculate the end from the bottom-right texel.
	u32 last_bp = first_bp;
	const bool single_point = (prim == GS_POINTLIST) && !(m_index.tail > 1);
	if (!single_point)
		last_bp = frame_psm.info.bn(temp_draw_rect.z - 1, temp_draw_rect.w - 1, prev_ctx.FRAME.Block(), prev_ctx.FRAME.FBW);

	m_mem.m_clut.InvalidateRange(first_bp, last_bp, true);
}
/// Checks whether the primitive currently being kicked samples texels that the vertices already queued in
/// this batch write, and calls Flush(GSFlushReason::AUTOFLUSH) when it does.
/// The vertex being processed is still in m_v; the other vertices of the same primitive are read back from
/// m_vertex.buff relative to m_vertex.head / m_vertex.tail.
/// The caller must ensure m_index.tail > 0: the last queued index is dereferenced below.
template<u32 prim>
__forceinline void GSState::HandleAutoFlush()
{
	// Kind of a cheat, making the assumption that 2 consecutive fan/strip triangles won't overlap each other (*should* be safe)
	if ((m_index.tail & 1) && (prim == GS_TRIANGLESTRIP || prim == GS_TRIANGLEFAN) && !m_texflush_flag)
		return;

	// To briefly explain what's going on here, what we are checking for is draws over a texture when the source and destination are themselves.
	// Because one page of the texture gets buffered in the Texture Cache (the PS2's one) if any of those pixels are overwritten, you still read the old data.
	// So we need to calculate if a page boundary is being crossed for the format it is in and if the same part of the texture being written and read inside the draw.
	if (IsAutoFlushDraw(prim))
	{
		// n is the vertex count of one primitive; buff[] holds the vertex buffer positions of the
		// vertices of this primitive other than the one in m_v (so only n - 1 entries are used).
		int n = 1;
		u32 buff[3];
		const u32 head = m_vertex.head;
		const u32 tail = m_vertex.tail;

		switch (prim)
		{
			case GS_POINTLIST:
				buff[0] = tail - 1;
				n = 1;
				break;
			case GS_LINELIST:
			case GS_LINESTRIP:
			case GS_SPRITE:
				buff[0] = tail - 1;
				n = 2;
				break;
			case GS_TRIANGLELIST:
			case GS_TRIANGLESTRIP:
				buff[0] = tail - 2;
				buff[1] = tail - 1;
				n = 3;
				break;
			case GS_TRIANGLEFAN:
				// Fans always share the head vertex.
				buff[0] = head;
				buff[1] = tail - 1;
				n = 3;
				break;
			case GS_INVALID:
			default:
				break;
		}

		GSVector4i tex_coord;

		// Prepare the currently processed vertex.
		if (PRIM->FST)
		{
			// U/V are 12.4 fixed point.
			tex_coord.x = m_v.U >> 4;
			tex_coord.y = m_v.V >> 4;
		}
		else
		{
			const float s = std::min((m_v.ST.S / m_v.RGBAQ.Q), 1.0f);
			const float t = std::min((m_v.ST.T / m_v.RGBAQ.Q), 1.0f);

			// NOTE(review): this conversion truncates, while the ones below use std::round - confirm that is intended.
			tex_coord.x = static_cast<int>((1 << m_context->TEX0.TW) * s);
			tex_coord.y = static_cast<int>((1 << m_context->TEX0.TH) * t);
		}

		GSVector4i tex_rect = tex_coord.xyxy();

		// The format tables are static data, so bind them by reference instead of copying the entries on every kick.
		const GSLocalMemory::psm_t& tex_psm = GSLocalMemory::m_psm[m_context->TEX0.PSM];
		const GSLocalMemory::psm_t& frame_psm = GSLocalMemory::m_psm[m_context->FRAME.PSM];

		// Get the rest of the rect.
		for (int i = 0; i < (n - 1); i++)
		{
			const GSVertex* v = &m_vertex.buff[buff[i]];

			if (PRIM->FST)
			{
				tex_coord.x = v->U >> 4;
				tex_coord.y = v->V >> 4;
			}
			else
			{
				const float s = std::min((v->ST.S / v->RGBAQ.Q), 1.0f);
				const float t = std::min((v->ST.T / v->RGBAQ.Q), 1.0f);

				tex_coord.x = static_cast<int>(std::round((1 << m_context->TEX0.TW) * s));
				tex_coord.y = static_cast<int>(std::round((1 << m_context->TEX0.TH) * t));
			}

			tex_rect.x = std::min(tex_rect.x, tex_coord.x);
			tex_rect.z = std::max(tex_rect.z, tex_coord.x);
			tex_rect.y = std::min(tex_rect.y, tex_coord.y);
			tex_rect.w = std::max(tex_rect.w, tex_coord.y);
		}

		// If the draw was 1 line thick, make it larger as rects are exclusive of ends.
		if (tex_rect.x == tex_rect.z)
			tex_rect += GSVector4i::cxpr(0, 0, 1, 0);

		if (tex_rect.y == tex_rect.w)
			tex_rect += GSVector4i::cxpr(0, 0, 0, 1);

		// Get the last texture position from the last draw.
		const GSVertex* v = &m_vertex.buff[m_index.buff[m_index.tail - 1]];

		if (PRIM->FST)
		{
			tex_coord.x = v->U >> 4;
			tex_coord.y = v->V >> 4;
		}
		else
		{
			const float s = std::min((v->ST.S / v->RGBAQ.Q), 1.0f);
			const float t = std::min((v->ST.T / v->RGBAQ.Q), 1.0f);

			tex_coord.x = static_cast<int>(std::round((1 << m_context->TEX0.TW) * s));
			tex_coord.y = static_cast<int>(std::round((1 << m_context->TEX0.TH) * t));
		}

		// Restrict both the new rectangle and the previous position to the region addressing window, if one is used.
		const int clamp_minu = m_context->CLAMP.MINU;
		const int clamp_maxu = m_context->CLAMP.MAXU;
		const int clamp_minv = m_context->CLAMP.MINV;
		const int clamp_maxv = m_context->CLAMP.MAXV;

		switch (m_context->CLAMP.WMS)
		{
			case CLAMP_REGION_CLAMP:
				tex_rect.x = std::max(tex_rect.x, clamp_minu);
				tex_rect.z = std::max(tex_rect.z, clamp_minu);
				tex_coord.x = std::max(tex_coord.x, clamp_minu);
				tex_rect.x = std::min(tex_rect.x, clamp_maxu);
				tex_rect.z = std::min(tex_rect.z, clamp_maxu);
				tex_coord.x = std::min(tex_coord.x, clamp_maxu);
				break;
			case CLAMP_REGION_REPEAT:
				// For region repeat the window runs from MAXU to (MAXU | MINU).
				tex_rect.x = std::max(tex_rect.x, clamp_maxu);
				tex_rect.z = std::max(tex_rect.z, clamp_maxu);
				tex_coord.x = std::max(tex_coord.x, clamp_maxu);
				tex_rect.x = std::min(tex_rect.x, (clamp_maxu | clamp_minu));
				tex_rect.z = std::min(tex_rect.z, (clamp_maxu | clamp_minu));
				tex_coord.x = std::min(tex_coord.x, (clamp_maxu | clamp_minu));
				break;
			default:
				break;
		}

		switch (m_context->CLAMP.WMT)
		{
			case CLAMP_REGION_CLAMP:
				tex_rect.y = std::max(tex_rect.y, clamp_minv);
				tex_rect.w = std::max(tex_rect.w, clamp_minv);
				tex_coord.y = std::max(tex_coord.y, clamp_minv);
				tex_rect.y = std::min(tex_rect.y, clamp_maxv);
				tex_rect.w = std::min(tex_rect.w, clamp_maxv);
				tex_coord.y = std::min(tex_coord.y, clamp_maxv);
				break;
			case CLAMP_REGION_REPEAT:
				// For region repeat the window runs from MAXV to (MAXV | MINV).
				tex_rect.y = std::max(tex_rect.y, clamp_maxv);
				tex_rect.w = std::max(tex_rect.w, clamp_maxv);
				tex_coord.y = std::max(tex_coord.y, clamp_maxv);
				tex_rect.y = std::min(tex_rect.y, (clamp_maxv | clamp_minv));
				tex_rect.w = std::min(tex_rect.w, (clamp_maxv | clamp_minv));
				tex_coord.y = std::min(tex_coord.y, (clamp_maxv | clamp_minv));
				break;
			default:
				break;
		}

		// Nothing being drawn intersect with the new texture, so no point in checking further.
		if (tex_psm.depth == frame_psm.depth && tex_rect.rintersect(temp_draw_rect).rempty())
			return;
		else if (m_texflush_flag)
		{
			Flush(GSFlushReason::AUTOFLUSH);
			return;
		}

		// Masks which round a texel coordinate down to the start of its page in the texture's format.
		const int tex_page_mask_x = ~(tex_psm.pgs.x - 1);
		const int tex_page_mask_y = ~(tex_psm.pgs.y - 1);
		const GSVector4i tex_page_mask = { tex_page_mask_x, tex_page_mask_y, tex_page_mask_x, tex_page_mask_y };
		const GSVector4i last_tex_page = tex_coord.xyxy() & tex_page_mask;
		const GSVector4i tex_page = tex_rect.xyxy() & tex_page_mask;

		// Crossed page since last draw end
		if (!tex_page.eq(last_tex_page))
		{
			// Make sure the format matches, otherwise the coordinates aren't gonna match, so the draws won't intersect.
			if (tex_psm.bpp == frame_psm.bpp && (m_context->FRAME.FBW == m_context->TEX0.TBW))
			{
				const GSVector2i offset = GSVector2i(m_context->XYOFFSET.OFX, m_context->XYOFFSET.OFY);
				const GSVector4i scissor = m_context->scissor.in;
				GSVector4i old_draw_rect = GSVector4i::zero();
				int current_draw_end = m_index.tail;

				// Walk the queued primitives from newest to oldest, rebuilding each one's screen rectangle
				// and testing it against the texels about to be read.
				while (current_draw_end >= n)
				{
					for (int i = current_draw_end - 1; i >= current_draw_end - n; i--)
					{
						const GSVertex* v = &m_vertex.buff[m_index.buff[i]];

						// A sprite's second vertex (odd index) is moved back by one pixel before the depth
						// swizzle below is applied, and forward again afterwards.
						if (prim == GS_SPRITE && (i & 1))
						{
							tex_coord.x = ((static_cast<int>(v->XYZ.X) - offset.x) >> 4) - 1;
							tex_coord.y = ((static_cast<int>(v->XYZ.Y) - offset.y) >> 4) - 1;
						}
						else
						{
							tex_coord.x = (static_cast<int>(v->XYZ.X) - offset.x) >> 4;
							tex_coord.y = (static_cast<int>(v->XYZ.Y) - offset.y) >> 4;
						}

						// When texture and frame differ in "depth", flip the half-page bit of the coordinate.
						if (tex_psm.depth != frame_psm.depth)
						{
							tex_coord.x ^= (frame_psm.pgs.x / 2);
							tex_coord.y ^= (frame_psm.pgs.y / 2);
						}

						if (prim == GS_SPRITE && (i & 1))
						{
							tex_coord.x += 1;
							tex_coord.y += 1;
						}

						if (i == (current_draw_end - 1))
						{
							old_draw_rect = tex_coord.xyxy();
						}
						else
						{
							old_draw_rect.x = std::min(old_draw_rect.x, tex_coord.x);
							old_draw_rect.z = std::max(old_draw_rect.z, tex_coord.x);
							old_draw_rect.y = std::min(old_draw_rect.y, tex_coord.y);
							old_draw_rect.w = std::max(old_draw_rect.w, tex_coord.y);
						}
					}

					// Same widening as for tex_rect: rects are exclusive of their ends.
					if (old_draw_rect.x == old_draw_rect.z)
						old_draw_rect += GSVector4i::cxpr(0, 0, 1, 0);

					if (old_draw_rect.y == old_draw_rect.w)
						old_draw_rect += GSVector4i::cxpr(0, 0, 0, 1);

					old_draw_rect = tex_rect.rintersect(old_draw_rect);

					// An earlier primitive wrote (inside the scissor) where this one reads: flush.
					if (!old_draw_rect.rintersect(scissor).rempty())
					{
						Flush(GSFlushReason::AUTOFLUSH);
						return;
					}

					current_draw_end -= n;
				}
			}
			else // Storage of the TEX and FRAME/Z is different, so uhh, just fall back to flushing each page. It's slower, sorry.
			{
				// Buffer widths expressed in pages of their own format.
				const int frame_width = (m_context->FRAME.FBW * 64) / frame_psm.pgs.x;
				const int tex_width = (m_context->TEX0.TBW * 64) / tex_psm.pgs.x;

				if ((frame_width == tex_width) || ((tex_rect.w / tex_psm.pgs.y) <= 1 && frame_width >= tex_width))
				{
					tex_rect += GSVector4i(0, 0, tex_page_mask.z, tex_page_mask.w); // round up to the next page as we will be comparing by page.
					// We know we've changed page, so let's set the dimension to cover the page they're in (for different pixel orders)
					tex_rect &= tex_page_mask;
					tex_rect = GSVector4i(tex_rect.x / tex_psm.pgs.x, tex_rect.y / tex_psm.pgs.y, tex_rect.z / tex_psm.pgs.x, tex_rect.w / tex_psm.pgs.y);

					const int frame_page_mask_x = ~(frame_psm.pgs.x - 1);
					const int frame_page_mask_y = ~(frame_psm.pgs.y - 1);
					const GSVector4i frame_page_mask = { frame_page_mask_x, frame_page_mask_y, frame_page_mask_x, frame_page_mask_y };

					GSVector4i area_out = temp_draw_rect;
					area_out += GSVector4i(0, 0, frame_page_mask.z, frame_page_mask.w); // round up to the next page as we will be comparing by page.
					area_out &= frame_page_mask;
					area_out = GSVector4i(area_out.x / frame_psm.pgs.x, area_out.y / frame_psm.pgs.y, area_out.z / frame_psm.pgs.x, area_out.w / frame_psm.pgs.y);

					// Both rectangles are now in page units.
					if (!area_out.rintersect(tex_rect).rempty())
						Flush(GSFlushReason::AUTOFLUSH);
				}
				else // Formats are too different so just flush it.
					Flush(GSFlushReason::AUTOFLUSH);
			}
		}
	}
}
/// Appends the vertex held in m_v to the vertex buffer and, once enough vertices for a primitive of type
/// `prim` are queued, either emits its indices into the index buffer or discards it.
/// A primitive is discarded when `skip` is non-zero, when the scissor is invalid, when it lies outside the
/// scissor cull bounds, or when it is degenerate.
/// When `auto_flush` is set, HandleAutoFlush<prim>() is run before the new vertex is stored.
template <u32 prim, bool auto_flush>
__forceinline void GSState::VertexKick(u32 skip)
{
constexpr u32 n = NumIndicesForPrim(prim);
static_assert(n > 0);
pxAssert(m_vertex.tail < m_vertex.maxcount + 3);
// Invalid primitive type: drop whatever has been queued since head and do nothing else.
if constexpr (prim == GS_INVALID)
{
m_vertex.tail = m_vertex.head;
return;
}
// Only check for auto flush when something is already queued and this vertex completes a primitive.
if (auto_flush && skip == 0 && m_index.tail > 0 && ((m_vertex.tail + 1) - m_vertex.head) >= n)
{
HandleAutoFlush<prim>();
}
// Read the positions after HandleAutoFlush, which may have called Flush().
u32 head = m_vertex.head;
u32 tail = m_vertex.tail;
u32 next = m_vertex.next;
u32 xy_tail = m_vertex.xy_tail;
// callers should write XYZUVF to m_v.m[1] in one piece to have this load store-forwarded, either by the cpu or the compiler when this function is inlined
const GSVector4i new_v0(m_v.m[0]);
const GSVector4i new_v1(m_v.m[1]);
GSVector4i* RESTRICT tailptr = (GSVector4i*)&m_vertex.buff[tail];
tailptr[0] = new_v0;
tailptr[1] = new_v1;
// We maintain the X/Y coordinates for the last 4 vertices, as well as the head for triangle fans, so we can compute
// the min/max, and cull degenerate triangles, which saves draws in some cases. Why 4? Mod 4 is cheaper than Mod 3.
// These vertices are a full vector containing <X_Fixed_Point, Y_Fixed_Point, X_Integer, Y_Integer>. We use the
// integer coordinates for culling at native resolution, and the fixed point for all others. The XY offset has to be
// applied, then we split it into the fixed/integer portions.
const GSVector4i xy_ofs = new_v1.xxxx().u16to32().sub32(m_xyof);
const GSVector4i xy = xy_ofs.blend32<12>(xy_ofs.sra32<4>());
m_vertex.xy[xy_tail & 3] = xy;
// Backup head for triangle fans so we can read it later, otherwise it'll get lost after the 4th vertex.
if (prim == GS_TRIANGLEFAN && tail == head)
m_vertex.xyhead = xy;
m_vertex.tail = ++tail;
m_vertex.xy_tail = ++xy_tail;
// m is the number of vertices queued since head; wait until there are enough for one primitive.
const u32 m = tail - head;
if (m < n)
return;
// Skip draws when scissor is out of range (i.e. bottom-right is less than top-left), since everything will get clipped.
skip |= static_cast<u32>(m_scissor_invalid);
GSVector4i pmin, pmax;
if (skip == 0)
{
// v0 is the newest vertex, v1 the one before it, v2 the third (or the saved head for fans).
const GSVector4i v0 = m_vertex.xy[(xy_tail - 1) & 3];
const GSVector4i v1 = m_vertex.xy[(xy_tail - 2) & 3];
const GSVector4i v2 = (prim == GS_TRIANGLEFAN) ? m_vertex.xyhead : m_vertex.xy[(xy_tail - 3) & 3];
// Bounding box of the primitive, in both the fixed point (xy) and integer (zw) lanes.
switch (prim)
{
case GS_POINTLIST:
pmin = v0;
pmax = v0;
break;
case GS_LINELIST:
case GS_LINESTRIP:
case GS_SPRITE:
pmin = v0.min_i32(v1);
pmax = v0.max_i32(v1);
break;
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
pmin = v0.min_i32(v1.min_i32(v2));
pmax = v0.max_i32(v1.max_i32(v2));
break;
default:
break;
}
// Entirely outside the scissor cull bounds on either axis.
GSVector4i test = pmax.lt32(m_scissor_cull_min) | pmin.gt32(m_scissor_cull_max);
switch (prim)
{
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
case GS_SPRITE:
{
// Discard degenerate triangles which don't cover at least one pixel. Since the vertices are in native
// resolution space, we can use the integer locations. When upscaling, we can't, because a primitive which
// does not span a single pixel at 1x may span multiple pixels at higher resolutions.
const GSVector4i degen_test = pmin.eq32(pmax);
test |= m_nativeres ? degen_test.zwzw() : degen_test;
}
break;
default:
break;
}
switch (prim)
{
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
// Triangles where any two vertices have the same fixed point X/Y are degenerate too.
test = (test | v0.eq64(v1)) | (v1.eq64(v2) | v0.eq64(v2));
break;
default:
break;
}
#ifndef _M_ARM64
// We only care about the xy passing the skip test. zw is the offset coordinates for native culling.
skip |= test.mask() & 0xff;
#else
// mask() is slow on ARM, so just pull the bits out instead, thankfully we only care about the low 8 bytes
// (the x/y lanes), which is the same set of bytes the 0xff mask selects on the other path.
skip |= (static_cast<u64>(test.extract64<0>()) & UINT64_C(0x8080808080808080)) != 0;
#endif
}
if (skip != 0)
{
// The primitive is discarded: rewind or advance the vertex positions without emitting any index.
switch (prim)
{
case GS_POINTLIST:
case GS_LINELIST:
case GS_TRIANGLELIST:
case GS_SPRITE:
m_vertex.tail = head; // no need to check or grow the buffer length
break;
case GS_LINESTRIP:
case GS_TRIANGLESTRIP:
// Strips keep their trailing vertices for the next primitive, so only the head moves on.
m_vertex.head = head + 1;
[[fallthrough]];
case GS_TRIANGLEFAN:
if (tail >= m_vertex.maxcount)
GrowVertexBuffer(); // in case too many vertices were skipped
break;
default:
ASSUME(0);
}
return;
}
if (tail >= m_vertex.maxcount)
GrowVertexBuffer();
// First primitive of a batch: snapshot the environment it is drawn with into m_prev_env, but only if the
// context changed or registers were dirtied since the last snapshot.
if (m_index.tail == 0 && ((m_backed_up_ctx != m_env.PRIM.CTXT) || m_dirty_gs_regs))
{
const int ctx = m_env.PRIM.CTXT;
// NOTE(review): 88 and 96 are hard-coded byte counts, presumably the leading register portion of the
// environment and context structs - confirm against the struct layouts.
std::memcpy(&m_prev_env, &m_env, 88);
std::memcpy(&m_prev_env.CTXT[ctx], &m_env.CTXT[ctx], 96);
std::memcpy(&m_prev_env.CTXT[ctx].offset, &m_env.CTXT[ctx].offset, sizeof(m_env.CTXT[ctx].offset));
std::memcpy(&m_prev_env.CTXT[ctx].scissor, &m_env.CTXT[ctx].scissor, sizeof(m_env.CTXT[ctx].scissor));
m_dirty_gs_regs = 0;
m_backed_up_ctx = m_env.PRIM.CTXT;
}
// Emit the indices for this primitive and advance head/next according to the primitive type.
u16* RESTRICT buff = &m_index.buff[m_index.tail];
switch (prim)
{
case GS_POINTLIST:
buff[0] = static_cast<u16>(head + 0);
m_vertex.head = head + 1;
m_vertex.next = head + 1;
m_index.tail += 1;
break;
case GS_LINELIST:
buff[0] = static_cast<u16>(head + 0);
buff[1] = static_cast<u16>(head + 1);
m_vertex.head = head + 2;
m_vertex.next = head + 2;
m_index.tail += 2;
break;
case GS_LINESTRIP:
// next trails head when earlier strip segments were skipped; move the live vertices down to next so
// the kept vertices stay packed.
if (next < head)
{
m_vertex.buff[next + 0] = m_vertex.buff[head + 0];
m_vertex.buff[next + 1] = m_vertex.buff[head + 1];
head = next;
m_vertex.tail = next + 2;
}
buff[0] = static_cast<u16>(head + 0);
buff[1] = static_cast<u16>(head + 1);
m_vertex.head = head + 1;
m_vertex.next = head + 2;
m_index.tail += 2;
break;
case GS_TRIANGLELIST:
buff[0] = static_cast<u16>(head + 0);
buff[1] = static_cast<u16>(head + 1);
buff[2] = static_cast<u16>(head + 2);
m_vertex.head = head + 3;
m_vertex.next = head + 3;
m_index.tail += 3;
break;
case GS_TRIANGLESTRIP:
// Same compaction as for line strips, with three live vertices.
if (next < head)
{
m_vertex.buff[next + 0] = m_vertex.buff[head + 0];
m_vertex.buff[next + 1] = m_vertex.buff[head + 1];
m_vertex.buff[next + 2] = m_vertex.buff[head + 2];
head = next;
m_vertex.tail = next + 3;
}
buff[0] = static_cast<u16>(head + 0);
buff[1] = static_cast<u16>(head + 1);
buff[2] = static_cast<u16>(head + 2);
m_vertex.head = head + 1;
m_vertex.next = head + 3;
m_index.tail += 3;
break;
case GS_TRIANGLEFAN:
// TODO: remove gaps, next == head && head < tail - 3 || next > head && next < tail - 2 (very rare)
// The fan's head stays put; each triangle is the head plus the two newest vertices.
buff[0] = static_cast<u16>(head + 0);
buff[1] = static_cast<u16>(tail - 2);
buff[2] = static_cast<u16>(tail - 1);
m_vertex.next = tail;
m_index.tail += 3;
break;
case GS_SPRITE:
buff[0] = static_cast<u16>(head + 0);
buff[1] = static_cast<u16>(head + 1);
// Update the first vert's Q for ease of doing Autoflush
if (!m_env.PRIM.FST)
m_vertex.buff[buff[0]].RGBAQ.Q = m_vertex.buff[buff[1]].RGBAQ.Q;
m_vertex.head = head + 2;
m_vertex.next = head + 2;
m_index.tail += 2;
break;
default:
ASSUME(0);
}
// Update rectangle for the current draw. We can use the re-integer coordinates from min/max here.
const GSVector4i draw_min = pmin.zwzw();
const GSVector4i draw_max = pmax;
// When the vertex tail equals n the rectangle is (re)initialised from this primitive (presumably the first
// one of the batch); otherwise the running rectangle is extended.
if (m_vertex.tail != n)
temp_draw_rect = temp_draw_rect.min_i32(draw_min).blend32<12>(temp_draw_rect.max_i32(draw_max));
else
temp_draw_rect = draw_min.blend32<12>(draw_max);
temp_draw_rect = temp_draw_rect.rintersect(m_context->scissor.in);
// Flush before the vertex count can exceed the per-primitive limit derived from the u16 index range.
constexpr u32 max_vertices = MaxVerticesForPrim(prim);
if (max_vertices != 0 && m_vertex.tail >= max_vertices)
Flush(VERTEXCOUNT);
}
/// Reports whether region repeat, i.e. `(value & msk) | fix`, changes at least one of the values in min...max.
/// Also writes the real range seen after applying it to every value in min...max:
/// `*min_out` receives the lowest result and `*max_out` one past the highest.
static bool UsesRegionRepeat(int fix, int msk, int min, int max, int* min_out, int* max_out)
{
	const bool crosses_zero = (min < 0) != (max < 0);
	if (crosses_zero)
	{
		// Algorithm doesn't work properly if bits overflow when incrementing (happens on the -1 -> 0 crossing)
		// Conveniently, crossing zero guarantees you use the full range
		*min_out = fix;
		*max_out = (fix | msk) + 1;
		return true;
	}

	unsigned long top_bit;

	// Every bit at or below the highest bit in which min and max differ takes both values somewhere in min...max.
	int varying = min ^ max;
	if (_BitScanReverse(&top_bit, varying))
		varying |= (1 << top_bit) - 1; // Fill in all lower bits

	const int forced_low = ~msk & ~fix; // Bits that are always cleared by applying msk and fix
	const int forced_high = fix; // Bits that are always set by applying msk and fix
	const int const_ones = min & ~varying; // Bits that are set in every value in min...max
	const int possible_ones = min | varying; // Bits that are set in at least one value in min...max

	// At least one bit in min...max is set / cleared by applying msk and fix
	const bool sets_bits = (forced_high | const_ones) != const_ones;
	const bool clears_bits = (forced_low & possible_ones) != 0;

	// Varying bits whose value never reaches the output because msk or fix overrides them.
	const int forced_varying = (forced_low | forced_high) & varying;

	// A variable bit that's `0` in `min` will at some point switch to a `1` (because it's variable)
	// When it does, all bits below it will switch to a `0` (that's how incrementing works)
	// If the 0 to 1 switch is reflected in the final output (not masked and not replaced by a fixed value),
	// the final value would be larger than the previous. Otherwise, the final value will be less.
	// The true minimum value is `min` with all bits below the most significant replaced variable `0` bit cleared
	int lowest = min;
	const int hidden_zeros_in_min = ~min & forced_varying;
	if (_BitScanReverse(&top_bit, hidden_zeros_in_min))
		lowest &= (~0u << top_bit);

	// Similar thing for max, but the first masked `1` bit
	int highest = max;
	const int hidden_ones_in_max = max & forced_varying;
	if (_BitScanReverse(&top_bit, hidden_ones_in_max))
		highest |= (1 << top_bit) - 1;

	*min_out = (msk & lowest) | fix;
	*max_out = ((msk & highest) | fix) + 1;

	return sets_bits || clears_bits;
}
/// Works out which rectangle of the texture the current draw can sample.
///
/// @param TEX0            Texture register; only TW/TH (log2 width/height) are read here.
/// @param CLAMP           Wrap modes (WMS/WMT) and the MINU/MAXU/MINV/MAXV region values.
/// @param linear          True when bilinear filtering is used; widens the sampled range by half a texel.
/// @param clamp_to_tsize  True to clip the region rectangle to the TW/TH sized texture; false to grow the
///                        texture rectangle to include the region instead.
/// @return { rectangle of texels needed (right/bottom exclusive), border usage flags }.
///
/// The vertex trace (m_vt) min/max texture coordinates drive the estimate; when they hold +-FLT_MAX the
/// whole texture is assumed and all four border flags are set.
GSState::TextureMinMaxResult GSState::GetTextureMinMax(GIFRegTEX0 TEX0, GIFRegCLAMP CLAMP, bool linear, bool clamp_to_tsize)
{
	// TODO: some of the +1s can be removed if linear == false

	const int tw = TEX0.TW;
	const int th = TEX0.TH;

	const int w = 1 << tw;
	const int h = 1 << th;
	const int tw_mask = (1 << tw) - 1;
	const int th_mask = (1 << th) - 1;

	// tr is the texture rectangle, vr the "valid" rectangle the wrap mode allows reads from.
	GSVector4i tr(0, 0, w, h);

	const int wms = CLAMP.WMS;
	const int wmt = CLAMP.WMT;

	const int minu = (int)CLAMP.MINU;
	const int minv = (int)CLAMP.MINV;
	const int maxu = (int)CLAMP.MAXU;
	const int maxv = (int)CLAMP.MAXV;

	GSVector4i vr = tr;

	switch (wms)
	{
		case CLAMP_REPEAT:
			break;
		case CLAMP_CLAMP:
			break;
		case CLAMP_REGION_CLAMP:
			vr.x = minu;
			vr.z = maxu + 1;
			break;
		case CLAMP_REGION_REPEAT:
			// For region repeat MINU is the mask and MAXU the fixed offset.
			vr.x = maxu;
			vr.z = (maxu | minu) + 1;
			break;
		default:
			ASSUME(0);
	}

	switch (wmt)
	{
		case CLAMP_REPEAT:
			break;
		case CLAMP_CLAMP:
			break;
		case CLAMP_REGION_CLAMP:
			vr.y = minv;
			vr.w = maxv + 1;
			break;
		case CLAMP_REGION_REPEAT:
			// For region repeat MINV is the mask and MAXV the fixed offset.
			vr.y = maxv;
			vr.w = (maxv | minv) + 1;
			break;
		default:
			ASSUME(0);
	}

	// Software renderer fixes TEX0 so that TW/TH contain MAXU/MAXV.
	// Hardware renderer doesn't, and handles it in the texture cache, so don't clamp here.
	if (clamp_to_tsize)
		vr = vr.rintersect(tr);
	else
		tr = tr.runion(vr);

	u8 uses_border = 0;

	if (m_vt.m_max.t.x >= FLT_MAX || m_vt.m_min.t.x <= -FLT_MAX ||
		m_vt.m_max.t.y >= FLT_MAX || m_vt.m_min.t.y <= -FLT_MAX)
	{
		// If any of the min/max values are +-FLT_MAX we can't rely on them
		// so just assume full texture.
		uses_border = 0xF;
	}
	else
	{
		// Optimisation aims to reduce the amount of texture loaded to only the bit which will be read
		GSVector4 st = m_vt.m_min.t.xyxy(m_vt.m_max.t);
		if (linear)
		{
			st += GSVector4(-0.5f, 0.5f).xxyy();

			// If it's the start of the texture and our little adjustment is all that pushed it over, clamp it to 0.
			// This stops the border check failing when using repeat but needed less than the full texture
			// since this was making it take the full texture even though it wasn't needed.
			if (!clamp_to_tsize)
			{
				const u32 mask = (m_vt.m_min.t.floor() == GSVector4::zero()).mask();
				if (mask & 1) // X == 0
					st.x = st.max(GSVector4::zero()).x;
				if (mask & 2) // Y == 0
					st.y = st.max(GSVector4::zero()).y;
			}
		}

		// draw will get scissored, adjust UVs to suit
		const GSVector2 pos_range(std::max(m_vt.m_max.p.x - m_vt.m_min.p.x, 1.0f), std::max(m_vt.m_max.p.y - m_vt.m_min.p.y, 1.0f));
		const GSVector2 uv_range(m_vt.m_max.t.x - m_vt.m_min.t.x, m_vt.m_max.t.y - m_vt.m_min.t.y);
		// Texels advanced per pixel on each axis.
		const GSVector2 grad(uv_range / pos_range);

		// Adjust texture range when sprites get scissor clipped. Since we linearly interpolate, this
		// optimization doesn't work when perspective correction is enabled.
		// Allowing for quads when the gradient is 1. It's not guaranteed (would need to check the gradient on each vector), but should be close enough.
		if (m_primitive_covers_without_gaps != NoGapsType::GapsFound && (m_vt.m_primclass == GS_SPRITE_CLASS || (m_vt.m_primclass == GS_TRIANGLE_CLASS && grad.x == 1.0f && grad.y == 1.0f && TrianglesAreQuads(false))))
		{
			// When coordinates are fractional, GS appears to draw to the right/bottom (effectively
			// taking the ceiling), not to the top/left (taking the floor).
			const GSVector4i int_rc(m_vt.m_min.p.ceil().xyxy(m_vt.m_max.p.floor()));
			const GSVector4i scissored_rc(int_rc.rintersect(m_context->scissor.in));
			if (!int_rc.eq(scissored_rc))
			{
				// vert_third is only dereferenced on the triangle paths below.
				const GSVertex* vert_first = &m_vertex.buff[m_index.buff[0]];
				const GSVertex* vert_second = &m_vertex.buff[m_index.buff[1]];
				const GSVertex* vert_third = &m_vertex.buff[m_index.buff[2]];

				GSVector4 new_st = st;
				bool u_forward_check = false;
				bool x_forward_check = false;

				if (m_vt.m_primclass == GS_TRIANGLE_CLASS)
				{
					u_forward_check = PRIM->FST ? ((vert_first->U < vert_second->U) || (vert_first->U < vert_third->U)) : (((vert_first->ST.S / vert_first->RGBAQ.Q) < (vert_second->ST.S / vert_second->RGBAQ.Q)) || ((vert_first->ST.S / vert_first->RGBAQ.Q) < (vert_third->ST.S / vert_third->RGBAQ.Q)));
					x_forward_check = (vert_first->XYZ.X < vert_second->XYZ.X) || (vert_first->XYZ.X < vert_third->XYZ.X);
				}
				else
				{
					// Horizontal direction must come from the S texture coordinate and the X position.
					// Both sprite vertices are divided by the first vertex's Q, as in the vertical check below.
					u_forward_check = PRIM->FST ? (vert_first->U < vert_second->U) : ((vert_first->ST.S / vert_first->RGBAQ.Q) < (vert_second->ST.S / vert_first->RGBAQ.Q));
					x_forward_check = vert_first->XYZ.X < vert_second->XYZ.X;
				}

				// Check if the UV coords are going in a different direction to the verts, if they match direction, no need to swap
				const bool u_forward = u_forward_check;
				const bool x_forward = x_forward_check;
				const bool swap_x = u_forward != x_forward;

				if (int_rc.left < scissored_rc.left)
				{
					if (!swap_x)
						new_st.x += floor(static_cast<float>(scissored_rc.left - int_rc.left) * grad.x);
					else
						new_st.z -= floor(static_cast<float>(scissored_rc.left - int_rc.left) * grad.x);
				}
				if (int_rc.right > scissored_rc.right)
				{
					if (!swap_x)
						new_st.z -= floor(static_cast<float>(int_rc.right - scissored_rc.right) * grad.x);
					else
						new_st.x += floor(static_cast<float>(int_rc.right - scissored_rc.right) * grad.x);
				}

				// we need to check that it's not going to repeat over the non-clipped part
				if (wms != CLAMP_REGION_REPEAT && (wms != CLAMP_REPEAT || (static_cast<int>(new_st.x) & ~tw_mask) == (static_cast<int>(new_st.z - 1) & ~tw_mask)))
				{
					st.x = new_st.x;
					st.z = new_st.z;
				}

				bool v_forward_check = false;
				bool y_forward_check = false;

				if (m_vt.m_primclass == GS_TRIANGLE_CLASS)
				{
					v_forward_check = PRIM->FST ? ((vert_first->V < vert_second->V) || (vert_first->V < vert_third->V)) : (((vert_first->ST.T / vert_first->RGBAQ.Q) < (vert_second->ST.T / vert_second->RGBAQ.Q)) || ((vert_first->ST.T / vert_first->RGBAQ.Q) < (vert_third->ST.T / vert_third->RGBAQ.Q)));
					y_forward_check = (vert_first->XYZ.Y < vert_second->XYZ.Y) || (vert_first->XYZ.Y < vert_third->XYZ.Y);
				}
				else
				{
					v_forward_check = PRIM->FST ? (vert_first->V < vert_second->V) : ((vert_first->ST.T / vert_first->RGBAQ.Q) < (vert_second->ST.T / vert_first->RGBAQ.Q));
					y_forward_check = vert_first->XYZ.Y < vert_second->XYZ.Y;
				}

				const bool v_forward = v_forward_check;
				const bool y_forward = y_forward_check;
				const bool swap_y = v_forward != y_forward;

				if (int_rc.top < scissored_rc.top)
				{
					if (!swap_y)
						new_st.y += floor(static_cast<float>(scissored_rc.top - int_rc.top) * grad.y);
					else
						new_st.w -= floor(static_cast<float>(scissored_rc.top - int_rc.top) * grad.y);
				}
				if (int_rc.bottom > scissored_rc.bottom)
				{
					if (!swap_y)
						new_st.w -= floor(static_cast<float>(int_rc.bottom - scissored_rc.bottom) * grad.y);
					else
						new_st.y += floor(static_cast<float>(int_rc.bottom - scissored_rc.bottom) * grad.y);
				}

				if (wmt != CLAMP_REGION_REPEAT && (wmt != CLAMP_REPEAT || (static_cast<int>(new_st.y) & ~th_mask) == (static_cast<int>(new_st.w - 1) & ~th_mask)))
				{
					st.y = new_st.y;
					st.w = new_st.w;
				}
			}
		}

		const GSVector4i uv = GSVector4i(st.floor());
		// Low two bits: min coordinate below vr; high two bits: max coordinate at or past vr's end.
		uses_border = GSVector4::cast((uv < vr).blend32<0xc>(uv >= vr)).mask();

		// Need to make sure we don't oversample, this can cause trouble in grabbing textures.
		// This may be inaccurate depending on the draw, but adding 1 all the time is wrong too.
		// FIXME: It breaks sw renderer so let's still use 1 for SW mode for now.
		const int inclusive_x_req = GSIsHardwareRenderer() ? (((m_vt.m_primclass < GS_TRIANGLE_CLASS) || (grad.x < 1.0f || (grad.x == 1.0f && m_vt.m_max.p.x != floor(m_vt.m_max.p.x)))) ? 1 : 0) : 1;
		const int inclusive_y_req = GSIsHardwareRenderer() ? (((m_vt.m_primclass < GS_TRIANGLE_CLASS) || (grad.y < 1.0f || (grad.y == 1.0f && m_vt.m_max.p.y != floor(m_vt.m_max.p.y)))) ? 1 : 0) : 1;

		// Roughly cut out the min/max of the read (Clamp)

		switch (wms)
		{
			case CLAMP_REPEAT:
				// Only narrow the range when min and max fall in the same repetition of the texture.
				if ((uv.x & ~tw_mask) == (uv.z & ~tw_mask))
				{
					vr.x = std::max(vr.x, uv.x & tw_mask);
					vr.z = std::min(vr.z, (uv.z & tw_mask) + inclusive_x_req);
				}
				break;
			case CLAMP_CLAMP:
			case CLAMP_REGION_CLAMP:
				if (vr.x < uv.x)
					vr.x = std::min(uv.x, vr.z - 1);
				if (vr.z > (uv.z + 1))
					vr.z = std::max(uv.z, vr.x) + inclusive_x_req;
				break;
			case CLAMP_REGION_REPEAT:
				// NOTE(review): tw is the log2 width (TEX0.TW), not the texel width w - confirm this comparison is intended.
				if (UsesRegionRepeat(maxu, minu, uv.x, uv.z, &vr.x, &vr.z) || maxu >= tw)
					uses_border |= TextureMinMaxResult::USES_BOUNDARY_U;
				break;
		}

		switch (wmt)
		{
			case CLAMP_REPEAT:
				if ((uv.y & ~th_mask) == (uv.w & ~th_mask))
				{
					vr.y = std::max(vr.y, uv.y & th_mask);
					vr.w = std::min(vr.w, (uv.w & th_mask) + inclusive_y_req);
				}
				break;
			case CLAMP_CLAMP:
			case CLAMP_REGION_CLAMP:
				if (vr.y < uv.y)
					vr.y = std::min(uv.y, vr.w - 1);
				if (vr.w > (uv.w + 1))
					vr.w = std::max(uv.w, vr.y) + inclusive_y_req;
				break;
			case CLAMP_REGION_REPEAT:
				// NOTE(review): th is the log2 height (TEX0.TH), not the texel height h - confirm this comparison is intended.
				if (UsesRegionRepeat(maxv, minv, uv.y, uv.w, &vr.y, &vr.w) || maxv >= th)
					uses_border |= TextureMinMaxResult::USES_BOUNDARY_V;
				break;
		}
	}

	vr = vr.rintersect(tr);

	// This really shouldn't happen now except with the clamping region set entirely outside the texture,
	// special handling should be written for that case.
	if (vr.rempty())
	{
		// NOTE: this can happen when texcoords are all outside the texture or clamping area is zero, but we can't
		// let the texture cache update nothing, the sampler will still need a single texel from the border somewhere
		// examples:
		// - THPS (no visible problems)
		// - NFSMW (strange rectangles on screen, might be unrelated)
		// - Lupin 3rd (huge problems, textures sizes seem to be randomly specified)

		// Grow by one texel towards the inside of the texture on each axis.
		const bool inc_x = vr.x < tr.z;
		const bool inc_y = vr.y < tr.w;
		vr = (vr + GSVector4i(inc_x ? 0 : -1, inc_y ? 0 : -1, inc_x ? 1 : 0, inc_y ? 1 : 0)).rintersect(tr);
	}
	else if (vr.xxzz().rempty())
	{
		const bool inc_x = vr.x < tr.z;
		vr = (vr + GSVector4i(inc_x ? 0 : -1, 0, inc_x ? 1 : 0, 0)).rintersect(tr);
	}
	else if (vr.yyww().rempty())
	{
		const bool inc_y = vr.y < tr.w;
		vr = (vr + GSVector4i(0, inc_y ? 0 : -1, 0, inc_y ? 1 : 0)).rintersect(tr);
	}

	return { vr, uses_border };
}
// Computes the range of alpha values the current draw can output and caches it in m_vt.m_alpha.
// tex_alpha_min / tex_alpha_max describe the alpha range of the texture being sampled, as supplied by the caller;
// a tex_alpha_max at or above INVALID_ALPHA_MINMAX is handled below as "texture range unknown".
void GSState::CalcAlphaMinMax(const int tex_alpha_min, const int tex_alpha_max)
{
// A range is already cached and the caller only offers the full 0-255 range, so there is nothing to refine.
if (m_vt.m_alpha.valid && tex_alpha_min == 0 && tex_alpha_max == 255)
return;
// We wanted to force an update as we now know the alpha of the non-indexed texture.
// Limit max to 255 as we send 500 when we don't know, makes calculating 24/16bit easier.
int min = tex_alpha_min, max = std::min(tex_alpha_max, 255);
if (IsCoverageAlpha())
{
// HW renderer doesn't currently support AA, so its min is 128.
// If we add AA support to the HW renderer, this will need to be changed.
// (Will probably only be supported with ROV/FBFetch so we would want to check for that.)
min = GSIsHardwareRenderer() ? 128 : 0;
max = 128;
}
else
{
const GSDrawingContext* context = m_context;
// a.x / a.z carry the resulting min / max alpha; a.y / a.w are replaced by the texture's min / max alpha below.
// NOTE(review): the swizzle presumably leaves the vertex alpha minimum in x/y and the maximum in z/w - confirm
// against GSVector4i::uph32 / zzww.
GSVector4i a = m_vt.m_min.c.uph32(m_vt.m_max.c).zzww();
// Texture alpha only takes part when texturing is on and TCC is set.
if (PRIM->TME && context->TEX0.TCC)
{
const GSDrawingEnvironment& env = *m_draw_env;
// Pick the texture alpha range according to the texture format class.
switch (GSLocalMemory::m_psm[context->TEX0.PSM].fmt)
{
case 0:
a.y = min;
a.w = max;
break;
case 1:
// If we're using the alpha from the texture, not the whole range, we can just use tex_alpha_min/max.
// AEM and TA0 are precomputed with GSBlock::ReadAndExpandBlock24, so already worked out for tex_alpha.
a.y = (tex_alpha_max < INVALID_ALPHA_MINMAX) ? min : (env.TEXA.AEM ? 0 : env.TEXA.TA0);
a.w = (tex_alpha_max < INVALID_ALPHA_MINMAX) ? max : env.TEXA.TA0;
break;
case 2:
// If we're using the alpha from the texture, not the whole range, we can just use tex_alpha_min/max.
// AEM, TA0 and TA1 are precomputed with GSBlock::ReadAndExpandBlock16, so already worked out for tex_alpha.
a.y = (tex_alpha_max < INVALID_ALPHA_MINMAX) ? min : (env.TEXA.AEM ? 0 : std::min(env.TEXA.TA0, env.TEXA.TA1));
a.w = (tex_alpha_max < INVALID_ALPHA_MINMAX) ? max : std::max(env.TEXA.TA0, env.TEXA.TA1);
break;
case 3:
// Known texture range wins; otherwise fall back to the alpha range reported by the CLUT.
if (tex_alpha_max < INVALID_ALPHA_MINMAX)
{
a.y = min;
a.w = max;
}
else
{
m_mem.m_clut.GetAlphaMinMax32(a.y, a.w);
}
break;
default:
ASSUME(0);
}
// Combine the vertex alpha (x/z) with the texture alpha (y/w) the way the selected texture function does.
switch (context->TEX0.TFX)
{
case TFX_MODULATE:
// Product scaled down by 128 (>> 7), saturated at 255.
a.x = (a.x * a.y) >> 7;
a.z = (a.z * a.w) >> 7;
if (a.x > 0xff)
a.x = 0xff;
if (a.z > 0xff)
a.z = 0xff;
break;
case TFX_DECAL:
// Texture alpha is used as-is.
a.x = a.y;
a.z = a.w;
break;
case TFX_HIGHLIGHT:
// Sum of both alphas, saturated at 255.
a.x = a.x + a.y;
a.z = a.z + a.w;
if (a.x > 0xff)
a.x = 0xff;
if (a.z > 0xff)
a.z = 0xff;
break;
case TFX_HIGHLIGHT2:
// Texture alpha is used as-is.
a.x = a.y;
a.z = a.w;
break;
default:
ASSUME(0);
}
}
min = a.x;
max = a.z;
}
// Publish the result and mark the cache as valid.
m_vt.m_alpha.min = min;
m_vt.m_alpha.max = max;
m_vt.m_alpha.valid = true;
}
void GSState::CorrectATEAlphaMinMax(const u32 atst, const int aref)
{
const GSVertexTrace::VertexAlpha& aminmax = GetAlphaMinMax();
int amin = aminmax.min;
int amax = aminmax.max;
switch (atst)
{
case ATST_LESS:
amin = std::min(amin, std::max(aref - 1, amin));
amax = std::min(amax, std::max(aref - 1, amin));
break;
case ATST_LEQUAL:
amin = std::min(amin, std::max(aref, amin));
amax = std::min(amax, std::max(aref, amin));
break;
case ATST_EQUAL:
amax = aref;
amin = aref;
break;
case ATST_GEQUAL:
amax = std::max(amax, std::min(aref, amax));
amin = std::max(amin, std::min(aref, amax));
break;
case ATST_GREATER:
amax = std::max(amax, std::min(aref + 1, amax));
amin = std::max(amin, std::min(aref + 1, amax));
break;
default:
break;
}
m_vt.m_alpha.min = amin;
m_vt.m_alpha.max = amax;
}
// Tries to settle the alpha test ahead of time using the known alpha range of the draw.
// fm / zm are the frame and Z write masks (in/out): when the test is known to fail for every pixel they are
// extended with whatever the configured fail action prevents from being written.
// Returns true when the outcome is known (or the test cannot change anything), false when the alpha range
// straddles the reference value and the test has to be evaluated for real.
bool GSState::TryAlphaTest(u32& fm, u32& zm)
{
// Shortcut for the easy case
if (m_context->TEST.ATST == ATST_ALWAYS)
return true;
const u32 framemask = GSLocalMemory::m_psm[m_context->FRAME.PSM].fmsk;
const u32 framemaskalpha = GSLocalMemory::m_psm[m_context->FRAME.PSM].fmsk & 0xFF000000;
const u32 fail_type = m_context->TEST.GetAFAIL(m_context->FRAME.PSM);
// Alpha test can only control the write of some channels. If channels are already masked
// the alpha test is therefore a nop.
switch (fail_type)
{
case AFAIL_KEEP:
break;
case AFAIL_FB_ONLY:
// A failing pixel would only lose its Z write, which is already fully masked.
if (zm == 0xFFFFFFFF)
return true;
break;
case AFAIL_ZB_ONLY:
// A failing pixel would only lose its frame write, which is already fully masked.
if ((fm & framemask) == framemask)
return true;
break;
case AFAIL_RGB_ONLY:
// A failing pixel would lose Z and frame alpha, both of which are already masked.
if (zm == 0xFFFFFFFF && (fm & framemaskalpha) == framemaskalpha)
return true;
break;
default:
ASSUME(0);
}
bool pass = true;
if (m_context->TEST.ATST == ATST_NEVER)
{
pass = false; // Shortcut to avoid GetAlphaMinMax below
}
else
{
const GSVertexTrace::VertexAlpha& aminmax = GetAlphaMinMax();
const int amin = aminmax.min;
const int amax = aminmax.max;
const int aref = m_context->TEST.AREF;
// For each comparison: the whole range passes -> pass, the whole range fails -> fail,
// anything in between cannot be decided here -> return false.
switch (m_context->TEST.ATST)
{
// NEVER and ALWAYS have already been dealt with above; these cases only keep the switch exhaustive.
case ATST_NEVER:
pass = false;
break;
case ATST_ALWAYS:
pass = true;
break;
case ATST_LESS:
if (amax < aref)
pass = true;
else if (amin >= aref)
pass = false;
else
return false;
break;
case ATST_LEQUAL:
if (amax <= aref)
pass = true;
else if (amin > aref)
pass = false;
else
return false;
break;
case ATST_EQUAL:
if (amin == aref && amax == aref)
pass = true;
else if (amin > aref || amax < aref)
pass = false;
else
return false;
break;
case ATST_GEQUAL:
if (amin >= aref)
pass = true;
else if (amax < aref)
pass = false;
else
return false;
break;
case ATST_GREATER:
if (amin > aref)
pass = true;
else if (amax <= aref)
pass = false;
else
return false;
break;
case ATST_NOTEQUAL:
if (amin == aref && amax == aref)
pass = false;
else if (amin > aref || amax < aref)
pass = true;
else
return false;
break;
default:
ASSUME(0);
}
}
// Every pixel fails: fold the fail action into the write masks so the draw can go ahead without the test.
if (!pass)
{
switch (fail_type)
{
case AFAIL_KEEP:
// Nothing is written at all.
fm = zm = 0xffffffff;
break;
case AFAIL_FB_ONLY:
// Only the frame buffer is written.
zm = 0xffffffff;
break;
case AFAIL_ZB_ONLY:
// Only the Z buffer is written.
fm = 0xffffffff;
break;
case AFAIL_RGB_ONLY:
// Only the colour channels of the frame buffer are written.
fm |= 0xff000000;
zm = 0xffffffff;
break;
default:
ASSUME(0);
}
}
return true;
}
bool GSState::IsOpaque()
{
if (PRIM->AA1)
return false;
if (!PRIM->ABE)
return true;
const GSDrawingContext* context = m_context;
int amin = 0;
int amax = 0xff;
if (context->ALPHA.A != context->ALPHA.B)
{
if (context->ALPHA.C == 0)
{
amin = GetAlphaMinMax().min;
amax = GetAlphaMinMax().max;
}
else if (context->ALPHA.C == 1)
{
if (context->FRAME.PSM == PSMCT24 || context->FRAME.PSM == PSMZ24)
amin = amax = 0x80;
}
else if (context->ALPHA.C == 2)
{
amin = amax = context->ALPHA.FIX;
}
}
return context->ALPHA.IsOpaque(amin, amax);
}
// True when the current draw samples mip levels: a maximum level above zero, a minification
// mode in the 2..5 range and a positive value in m_vt.m_lod.y.
bool GSState::IsMipMapDraw()
{
	const auto& tex1 = m_context->TEX1;

	const bool has_levels = tex1.MXL > 0;
	const bool mip_filter = tex1.MMIN >= 2 && tex1.MMIN <= 5;
	const bool lod_above_base = m_vt.m_lod.y > 0;

	return has_levels && mip_filter && lod_above_base;
}
// True when mipmapping is switched on (m_mipmap) and the current draw actually uses mip levels.
bool GSState::IsMipMapActive()
{
	if (!m_mipmap)
		return false;

	return IsMipMapDraw();
}
// True when alpha comes from AA1 coverage: AA1 set, blending off, and the primitive is a line or a triangle.
bool GSState::IsCoverageAlpha()
{
	if (PRIM->ABE || !PRIM->AA1)
		return false;

	const auto prim_class = m_vt.m_primclass;
	return prim_class == GS_LINE_CLASS || prim_class == GS_TRIANGLE_CLASS;
}
// Builds the TEX0 register describing mip level `lod` of the current texture.
// Level 0 is the context's TEX0 itself; levels 1-3 take base pointer and buffer width from MIPTBP1,
// levels 4-6 from MIPTBP2. Any other level is reported as an error and keeps the level 0 address.
// The size exponents TW/TH are lowered by `lod`, stopping at zero.
GIFRegTEX0 GSState::GetTex0Layer(u32 lod)
{
	if (lod == 0)
		return m_context->TEX0;

	const auto& mip123 = m_context->MIPTBP1;
	const auto& mip456 = m_context->MIPTBP2;
	GIFRegTEX0 layer = m_context->TEX0;

	switch (lod)
	{
		case 1:
			layer.TBP0 = mip123.TBP1;
			layer.TBW = mip123.TBW1;
			break;
		case 2:
			layer.TBP0 = mip123.TBP2;
			layer.TBW = mip123.TBW2;
			break;
		case 3:
			layer.TBP0 = mip123.TBP3;
			layer.TBW = mip123.TBW3;
			break;
		case 4:
			layer.TBP0 = mip456.TBP4;
			layer.TBW = mip456.TBW4;
			break;
		case 5:
			layer.TBP0 = mip456.TBP5;
			layer.TBW = mip456.TBW5;
			break;
		case 6:
			layer.TBP0 = mip456.TBP6;
			layer.TBW = mip456.TBW6;
			break;
		default:
			Console.Error("GS: Invalid guest lod setting. Please report: https://github.com/PCSX2/pcsx2/issues");
	}

	// Each level halves the texture, i.e. lowers the size exponent by one.
	layer.TH = (layer.TH <= lod) ? 0 : (layer.TH - lod);
	layer.TW = (layer.TW <= lod) ? 0 : (layer.TW - lod);

	return layer;
}
// GSTransferBuffer

// Allocates the transfer staging buffer: 4 MiB with 32 byte alignment.
GSState::GSTransferBuffer::GSTransferBuffer()
{
	constexpr size_t buffer_bytes = 4 * 1024 * 1024;
	void* const storage = _aligned_malloc(buffer_bytes, 32);
	buff = static_cast<u8*>(storage);
}
// Releases the staging buffer obtained with _aligned_malloc in the constructor.
GSState::GSTransferBuffer::~GSTransferBuffer()
{
_aligned_free(buff);
}
// Starts a new transfer: stores copies of the registers that describe it, derives the
// transfer rectangle and resets the progress counters.
// is_write selects the destination (DSAX/DSAY) rather than the source (SSAX/SSAY) start coordinates.
void GSState::GSTransferBuffer::Init(GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG, const GIFRegBITBLTBUF& blit, bool is_write)
{
	// Start position depends on the transfer direction.
	if (is_write)
	{
		x = TRXPOS.DSAX;
		y = TRXPOS.DSAY;
	}
	else
	{
		x = TRXPOS.SSAX;
		y = TRXPOS.SSAY;
	}

	// Size of the transferred area and the rectangle it covers.
	w = TRXREG.RRW;
	h = TRXREG.RRH;
	rect = GSVector4i(x, y, x + w, y + h);

	// Nothing has been transferred yet.
	total = 0;
	start = 0;
	end = 0;

	// Keep the registers this transfer was set up with.
	m_blit = blit;
	m_pos = TRXPOS;
	m_reg = TRXREG;
	write = is_write;
}
// Limits an incoming chunk of `len` bytes to what is still missing from the transfer.
// tw / th / bpp give the transfer size in pixels and bits per pixel; on the first call they fix the
// total byte count (capped at 4 MiB). `len` is reduced in place when it exceeds the remainder.
// Returns true while there is still something to consume.
bool GSState::GSTransferBuffer::Update(int tw, int th, int bpp, int& len)
{
	const int total_bits = tw * th * bpp;
	const int tex_size = (total_bits + 7) >> 3; // Round up to whole bytes
	const int packet_size = (tex_size + 15) & ~0xF; // Round up to the nearest quadword

	if (total == 0)
		total = std::min<int>(tex_size, 1024 * 1024 * 4);

	const int remaining = total - end;
	const bool too_much = len > remaining;

	if (too_much)
	{
		// More than even the quadword padded packet: the sender is overrunning the transfer.
		if (len > packet_size)
		{
#if defined(_DEBUG)
			Console.Warning("GS transfer buffer overflow len %d remaining %d, tex_size %d tw %d th %d bpp %d", len, remaining, tex_size, tw, th, bpp);
#endif
		}

		len = remaining;
	}

	return len > 0;
}
// Per video mode display parameters, indexed by GSPCRTCRegs::videomode (the GSVideoMode value minus one).
// NOTE(review): going by the resolutions the rows look like NTSC, PAL, VESA, 480p, 720p and 1080i - confirm
// against the GSVideoMode enum.
// Lanes as used in this file: x = maximum display width, y = display height (doubled by the users when a full
// height frame is wanted), z = horizontal base offset, w = vertical base offset.
//
// The horizontal offset values (under z) for PAL and NTSC have been tweaked
// they should be apparently 632 and 652 respectively, but that causes a thick black line on the left
// these values leave a small black line on the right in a bunch of games, but it's not so bad.
// The only conclusion I can come to is there is horizontal overscan expected so there would normally
// be black borders either side anyway, or both sides slightly covered.
static inline constexpr GSVector4i VideoModeOffsets[6] = {
GSVector4i::cxpr(640, 224, 642, 25),
GSVector4i::cxpr(640, 256, 676, 36),
GSVector4i::cxpr(640, 480, 276, 34),
GSVector4i::cxpr(720, 480, 232, 35),
GSVector4i::cxpr(1280, 720, 302, 24),
GSVector4i::cxpr(1920, 540, 238, 40)
};
// Same layout as VideoModeOffsets; used instead of it when GSConfig.PCRTCOverscan is set.
// Only the first two rows differ from the table above.
static inline constexpr GSVector4i VideoModeOffsetsOverscan[6] = {
GSVector4i::cxpr(711, 240, 498, 17),
GSVector4i::cxpr(711, 288, 532, 21),
GSVector4i::cxpr(640, 480, 276, 34),
GSVector4i::cxpr(720, 480, 232, 35),
GSVector4i::cxpr(1280, 720, 302, 24),
GSVector4i::cxpr(1920, 540, 238, 40)
};
// Per video mode dividers, same indexing as above: x + 1 and y + 1 are the horizontal / vertical divisors
// applied to DISPLAY register sizes and offsets.
// NOTE(review): z and w are not read in this part of the file; they look like the largest DW / DH values - confirm.
static inline constexpr GSVector4i VideoModeDividers[6] = {
GSVector4i::cxpr(3, 0, 2559, 239),
GSVector4i::cxpr(3, 0, 2559, 287),
GSVector4i::cxpr(1, 0, 1279, 479),
GSVector4i::cxpr(1, 0, 1439, 479),
GSVector4i::cxpr(0, 0, 1279, 719),
GSVector4i::cxpr(0, 0, 1919, 1079)
};
bool GSState::GSPCRTCRegs::IsAnalogue()
{
const GSVideoMode video = static_cast<GSVideoMode>(videomode + 1);
return video == GSVideoMode::NTSC || video == GSVideoMode::PAL || video == GSVideoMode::HDTV_1080I;
}
// Works out, separately for the horizontal (x) and vertical (y) direction, which display's offset
// lies closest to the base offset of the current video mode.
// With a single enabled display that display is returned for both directions; with none enabled,
// or on a tie, the answer is display 1.
GSVector2i GSState::GSPCRTCRegs::NearestToZeroOffset()
{
	GSVector2i nearest = { 1, 1 };

	// Display 0 off (this also covers both being off): display 1 is the answer.
	if (!PCRTCDisplays[0].enabled)
		return nearest;

	// Only display 0 on.
	if (!PCRTCDisplays[1].enabled)
	{
		nearest.x = 0;
		nearest.y = 0;
		return nearest;
	}

	const int horizontalOffset = VideoModeOffsets[videomode].z;
	if (abs(PCRTCDisplays[0].displayOffset.x - horizontalOffset) <
		abs(PCRTCDisplays[1].displayOffset.x - horizontalOffset))
	{
		nearest.x = 0;
	}

	// When interlaced, the vertical base offset is doubled
	const int verticalOffset = VideoModeOffsets[videomode].w * (1 << interlaced);
	if (abs(PCRTCDisplays[0].displayOffset.y - verticalOffset) <
		abs(PCRTCDisplays[1].displayOffset.y - verticalOffset))
	{
		nearest.y = 0;
	}

	return nearest;
}
// Stores the video mode as a zero based index (enum value minus one), which is how the
// VideoModeOffsets / VideoModeOffsetsOverscan / VideoModeDividers tables are indexed.
// NOTE(review): an enum value of 0 would yield index -1 - presumably callers never pass one; confirm.
void GSState::GSPCRTCRegs::SetVideoMode(GSVideoMode videoModeIn)
{
videomode = static_cast<int>(videoModeIn) - 1;
}
// Latches which display circuits are enabled (PMODE EN1/EN2) together with the field mode flags.
// Interlacing and field toggling are only accepted for the video modes IsAnalogue() reports.
void GSState::GSPCRTCRegs::EnableDisplays(GSRegPMODE pmode, GSRegSMODE2 smode2, bool smodetoggle)
{
	const bool analogue = IsAnalogue();

	PCRTCDisplays[0].enabled = pmode.EN1;
	PCRTCDisplays[1].enabled = pmode.EN2;

	FFMD = smode2.FFMD;
	interlaced = analogue && smode2.INT;
	toggling_field = analogue && smodetoggle;
}
// Updates PCRTCSameSrc: set when both displays are enabled and read from the same framebuffer,
// i.e. same base pointer, same buffer width and bit-compatible pixel formats.
void GSState::GSPCRTCRegs::CheckSameSource()
{
	const auto& first = PCRTCDisplays[0];
	const auto& second = PCRTCDisplays[1];

	// With fewer than two active displays there is nothing to compare.
	if (!(first.enabled && second.enabled))
	{
		PCRTCSameSrc = false;
		return;
	}

	const bool same_base = first.FBP == second.FBP;
	const bool same_width = first.FBW == second.FBW;

	PCRTCSameSrc = same_base && same_width && GSUtil::HasCompatibleBits(first.PSM, second.PSM);
}
bool GSState::GSPCRTCRegs::FrameWrap()
{
const GSVector4i combined_rect = GSVector4i(PCRTCDisplays[0].framebufferRect.runion(PCRTCDisplays[1].framebufferRect));
return combined_rect.w >= 2048 || combined_rect.z >= 2048;
}
// If the start point of both frames match, we can do a single read.
// Simply reports the PCRTCSameSrc flag maintained by CheckSameSource().
bool GSState::GSPCRTCRegs::FrameRectMatch()
{
return PCRTCSameSrc;
}
// Returns the output resolution. With screen offsets enabled this is the fixed size of the video
// mode; otherwise it is the size of the enabled display rectangle(s), limited to that fixed size.
GSVector2i GSState::GSPCRTCRegs::GetResolution()
{
	const GSVector4i offsets = GSConfig.PCRTCOverscan ? VideoModeOffsetsOverscan[videomode] : VideoModeOffsets[videomode];

	// The table height is per field: use twice that when interlaced, when fields toggle, or when
	// deinterlacing is switched off.
	const bool deinterlace_off = GSConfig.InterlaceMode == GSInterlaceMode::Off;
	const bool is_full_height = interlaced || deinterlace_off || toggling_field;

	const int max_width = offsets.x;
	const int max_height = is_full_height ? (offsets.y << 1) : offsets.y;

	GSVector2i resolution;

	if (GSConfig.PCRTCOffsets)
	{
		resolution = { max_width, max_height };
	}
	else if (PCRTCDisplays[0].enabled && PCRTCDisplays[1].enabled)
	{
		const GSVector4i combined_size = PCRTCDisplays[0].displayRect.runion(PCRTCDisplays[1].displayRect);
		resolution = { combined_size.width(), combined_size.height() };
	}
	else if (PCRTCDisplays[0].enabled)
	{
		resolution = { PCRTCDisplays[0].displayRect.width(), PCRTCDisplays[0].displayRect.height() };
	}
	else
	{
		resolution = { PCRTCDisplays[1].displayRect.width(), PCRTCDisplays[1].displayRect.height() };
	}

	// Never report more than the video mode can show.
	resolution.x = std::min(resolution.x, max_width);
	resolution.y = std::min(resolution.y, max_height);

	return resolution;
}
// Returns the framebuffer rectangle of the given display, or the union of both when display is -1.
GSVector4i GSState::GSPCRTCRegs::GetFramebufferRect(int display)
{
	if (display != -1)
		return PCRTCDisplays[display].framebufferRect;

	return GSVector4i(PCRTCDisplays[0].framebufferRect.runion(PCRTCDisplays[1].framebufferRect));
}
// Bits per pixel of the first enabled display's framebuffer format; 32 when no display is enabled.
int GSState::GSPCRTCRegs::GetFramebufferBitDepth()
{
	for (int i = 0; i < 2; i++)
	{
		if (PCRTCDisplays[i].enabled)
			return GSLocalMemory::m_psm[PCRTCDisplays[i].PSM].bpp;
	}

	return 32;
}
// Returns the size (width, height) of the framebuffer area to read for the given display,
// or for both displays combined when display is -1. The height is capped so the read does not
// extend past what the video mode can display.
GSVector2i GSState::GSPCRTCRegs::GetFramebufferSize(int display)
{
// Height limit of the video mode; the table value is doubled unless a half height (FFMD + interlaced) read is used.
int max_height = !GSConfig.PCRTCOverscan ? VideoModeOffsets[videomode].y : VideoModeOffsetsOverscan[videomode].y;
if (!(FFMD && interlaced))
{
max_height *= 2;
}
if (display == -1)
{
GSVector4i combined_rect = PCRTCDisplays[0].framebufferRect.runion(PCRTCDisplays[1].framebufferRect);
// The rectangle reaches 2048 or beyond horizontally: bring the right edge back. The hardware renderer
// subtracts 2048, the software renderer the larger of the two left edges.
if (combined_rect.z >= 2048)
{
const int high_x = (PCRTCDisplays[0].framebufferRect.x > PCRTCDisplays[1].framebufferRect.x) ? PCRTCDisplays[0].framebufferRect.x : PCRTCDisplays[1].framebufferRect.x;
combined_rect.z -= GSIsHardwareRenderer() ? 2048 : high_x;
combined_rect.x = 0;
}
// Same treatment vertically.
if (combined_rect.w >= 2048)
{
const int high_y = (PCRTCDisplays[0].framebufferRect.y > PCRTCDisplays[1].framebufferRect.y) ? PCRTCDisplays[0].framebufferRect.y : PCRTCDisplays[1].framebufferRect.y;
combined_rect.w -= GSIsHardwareRenderer() ? 2048 : high_y;
combined_rect.y = 0;
}
// Cap the framebuffer read to the maximum display height, otherwise the hardware renderer gets messy.
// The smaller vertical magnification of the two displays is used, and never less than 1.
const int min_mag = std::max(1, std::min(PCRTCDisplays[0].magnification.y, PCRTCDisplays[1].magnification.y));
int offset = PCRTCDisplays[0].displayRect.runion(PCRTCDisplays[1].displayRect).y;
if (FFMD && interlaced)
{
offset = (offset - 1) / 2;
}
// Hardware mode needs a wider framebuffer as it can't offset the read.
if (GSIsHardwareRenderer())
{
combined_rect.z += std::max(PCRTCDisplays[0].framebufferOffsets.x, PCRTCDisplays[1].framebufferOffsets.x);
combined_rect.w += std::max(PCRTCDisplays[0].framebufferOffsets.y, PCRTCDisplays[1].framebufferOffsets.y);
}
// Height limit in framebuffer lines: display limit plus the framebuffer start, scaled down by the
// magnification and reduced by the display's vertical start.
max_height += combined_rect.y;
offset = (max_height / min_mag) - offset;
combined_rect.w = std::min(combined_rect.w, offset);
return GSVector2i(combined_rect.z, combined_rect.w);
}
else
{
GSVector4i out_rect = PCRTCDisplays[display].framebufferRect;
// An edge at 2048 or beyond is brought back by the rectangle's own start position.
if (out_rect.z >= 2048)
out_rect.z -= out_rect.x;
if (out_rect.w >= 2048)
out_rect.w -= out_rect.y;
// Cap the framebuffer read to the maximum display height, otherwise the hardware renderer gets messy.
const int min_mag = std::max(1, PCRTCDisplays[display].magnification.y);
int offset = PCRTCDisplays[display].displayRect.y;
if (FFMD && interlaced)
{
offset = (offset - 1) / 2;
}
// Same height limit calculation as in the combined case above.
max_height += out_rect.y;
offset = (max_height / min_mag) - offset;
out_rect.w = std::min(out_rect.w, offset);
return GSVector2i(out_rect.z, out_rect.w);
}
}
// Sets up the rectangles for both the framebuffer read and the displays for the merge circuit.
// display: index of the display circuit being configured.
// displayReg / framebufferReg: the DISPLAY and DISPFB register values of that circuit.
// The previous framebuffer settings and offsets are kept in the prev* members before being overwritten.
// Both rectangles are produced with their origin at (0, 0); the offsets are stored separately.
void GSState::GSPCRTCRegs::SetRects(int display, GSRegDISPLAY displayReg, GSRegDISPFB framebufferReg)
{
// Save framebuffer information first, while we're here.
PCRTCDisplays[display].prevFramebufferReg.FBP = PCRTCDisplays[display].FBP;
PCRTCDisplays[display].prevFramebufferReg.FBW = PCRTCDisplays[display].FBW;
PCRTCDisplays[display].prevFramebufferReg.PSM = PCRTCDisplays[display].PSM;
PCRTCDisplays[display].prevFramebufferReg.DBX = PCRTCDisplays[display].DBX;
PCRTCDisplays[display].prevFramebufferReg.DBY = PCRTCDisplays[display].DBY;
PCRTCDisplays[display].FBP = framebufferReg.FBP;
PCRTCDisplays[display].FBW = framebufferReg.FBW;
PCRTCDisplays[display].PSM = framebufferReg.PSM;
PCRTCDisplays[display].DBX = framebufferReg.DBX;
PCRTCDisplays[display].DBY = framebufferReg.DBY;
// Probably not really enabled but will cause a mess.
// Q-Ball Billiards enables both circuits but doesn't set one of them up.
if (PCRTCDisplays[display].FBW == 0 && displayReg.DW == 0 && displayReg.DH == 0 && displayReg.MAGH == 0)
{
PCRTCDisplays[display].enabled = false;
return;
}
// The register fields hold "value - 1", hence the + 1 on magnification and size.
PCRTCDisplays[display].magnification = GSVector2i(displayReg.MAGH + 1, displayReg.MAGV + 1);
const u32 DW = displayReg.DW + 1;
const u32 DH = displayReg.DH + 1;
// Size of the area read from the framebuffer: display size with the magnification taken out.
const int renderWidth = DW / PCRTCDisplays[display].magnification.x;
const int renderHeight = DH / PCRTCDisplays[display].magnification.y;
u32 finalDisplayWidth = renderWidth;
u32 finalDisplayHeight = renderHeight;
// When using screen offsets the screen gets squashed/resized in to the actual screen size.
if (GSConfig.PCRTCOffsets)
{
finalDisplayWidth = DW / (VideoModeDividers[videomode].x + 1);
finalDisplayHeight = DH / (VideoModeDividers[videomode].y + 1);
}
else
{
// Without screen offsets the render size is kept, but never larger than the video mode's divided size.
finalDisplayWidth = std::min(finalDisplayWidth ,DW / (VideoModeDividers[videomode].x + 1));
finalDisplayHeight = std::min(finalDisplayHeight, DH / (VideoModeDividers[videomode].y + 1));
}
// Framebuffer size and offsets.
PCRTCDisplays[display].prevFramebufferOffsets = PCRTCDisplays[display].framebufferOffsets;
PCRTCDisplays[display].framebufferRect.x = 0;
PCRTCDisplays[display].framebufferRect.y = 0;
PCRTCDisplays[display].framebufferRect.z = renderWidth;
if(FFMD && interlaced) // Round up the height as if it's an odd value, this will cause havok with the merge circuit.
PCRTCDisplays[display].framebufferRect.w = (renderHeight + 1) >> (FFMD * interlaced); // Half height read if FFMD + INT enabled.
else
PCRTCDisplays[display].framebufferRect.w = renderHeight;
PCRTCDisplays[display].framebufferOffsets.x = framebufferReg.DBX;
PCRTCDisplays[display].framebufferOffsets.y = framebufferReg.DBY;
const bool is_interlaced_resolution = interlaced || (toggling_field && GSConfig.InterlaceMode != GSInterlaceMode::Off);
// If the interlace flag isn't set, but it's still interlacing, the height is likely reported wrong.
// Q-Ball Billiards.
if (is_interlaced_resolution && !interlaced)
finalDisplayHeight *= 2;
// Display size and offsets.
PCRTCDisplays[display].displayRect.x = 0;
PCRTCDisplays[display].displayRect.y = 0;
PCRTCDisplays[display].displayRect.z = finalDisplayWidth;
PCRTCDisplays[display].displayRect.w = finalDisplayHeight;
PCRTCDisplays[display].prevDisplayOffset = PCRTCDisplays[display].displayOffset;
PCRTCDisplays[display].displayOffset.x = displayReg.DX;
PCRTCDisplays[display].displayOffset.y = displayReg.DY;
}
// Applies the framebuffer read offsets (DBX/DBY) to both framebuffer rectangles.
// With "Anti-blur" enabled, both displays reading the same source and no scanmask, a one pixel
// difference between the two offsets is removed first by moving the larger offset onto the smaller
// one, provided the display rectangles start at the same position in that direction.
void GSState::GSPCRTCRegs::CalculateFramebufferOffset(bool scanmask)
{
	auto& first = PCRTCDisplays[0];
	auto& second = PCRTCDisplays[1];

	if (GSConfig.PCRTCAntiBlur && PCRTCSameSrc && !scanmask)
	{
		// An offset whose read would run past 2048 is replaced by its distance from 2048.
		const auto wrapped = [](int offset, int extent) -> int {
			if (offset + extent > 2048)
				return abs(offset - 2048);
			return offset;
		};

		const int first_x = wrapped(first.framebufferOffsets.x, first.displayRect.z);
		const int first_y = wrapped(first.framebufferOffsets.y, first.displayRect.w);
		const int second_x = wrapped(second.framebufferOffsets.x, second.displayRect.z);
		const int second_y = wrapped(second.framebufferOffsets.y, second.displayRect.w);

		const bool one_line_apart = abs(second_y - first_y) == 1;
		if (one_line_apart && first.displayRect.y == second.displayRect.y)
		{
			if (second_y < first_y)
				first.framebufferOffsets.y = second_y;
			else
				second.framebufferOffsets.y = first_y;
		}

		const bool one_column_apart = abs(second_x - first_x) == 1;
		if (one_column_apart && first.displayRect.x == second.displayRect.x)
		{
			if (second_x < first_x)
				first.framebufferOffsets.x = second_x;
			else
				second.framebufferOffsets.x = first_x;
		}
	}

	// Move both framebuffer rectangles by their (possibly adjusted) offsets.
	for (int i = 0; i < 2; i++)
	{
		auto& circuit = PCRTCDisplays[i];
		circuit.framebufferRect.x += circuit.framebufferOffsets.x;
		circuit.framebufferRect.z += circuit.framebufferOffsets.x;
		circuit.framebufferRect.y += circuit.framebufferOffsets.y;
		circuit.framebufferRect.w += circuit.framebufferOffsets.y;
	}
}
// Used in software mode to align the buffer when reading. Offset is accounted for (block aligned) by GetOutput.
void GSState::GSPCRTCRegs::RemoveFramebufferOffset(int display)
{
if (display >= 0)
{
// Hardware needs nothing but handling for wrapped framebuffers.
if (GSIsHardwareRenderer())
{
if (PCRTCDisplays[display].framebufferRect.z >= 2048)
{
PCRTCDisplays[display].displayRect.x += 2048 - PCRTCDisplays[display].framebufferRect.x;
PCRTCDisplays[display].displayRect.z += 2048 - PCRTCDisplays[display].framebufferRect.x;
PCRTCDisplays[display].framebufferRect.x = 0;
PCRTCDisplays[display].framebufferRect.z -= 2048;
}
if (PCRTCDisplays[display].framebufferRect.w >= 2048)
{
PCRTCDisplays[display].displayRect.y += 2048 - PCRTCDisplays[display].framebufferRect.y;
PCRTCDisplays[display].displayRect.w += 2048 - PCRTCDisplays[display].framebufferRect.y;
PCRTCDisplays[display].framebufferRect.y = 0;
PCRTCDisplays[display].framebufferRect.w -= 2048;
}
}
else
{
const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[PCRTCDisplays[display].PSM];
// Software mode - See note below.
GSVector4i r = PCRTCDisplays[display].framebufferRect;
r = r.ralign<Align_Outside>(psm.bs);
PCRTCDisplays[display].framebufferRect.z -= r.x;
PCRTCDisplays[display].framebufferRect.w -= r.y;
PCRTCDisplays[display].framebufferRect.x -= r.x;
PCRTCDisplays[display].framebufferRect.y -= r.y;
}
}
else
{
// Software Mode Note:
// This code is to read the framebuffer nicely block aligned in software, then leave the remaining offset in to the block.
// In hardware mode this doesn't happen, it reads the whole framebuffer, so we need to keep the offset.
if (!GSIsHardwareRenderer())
{
const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[PCRTCDisplays[1].PSM];
GSVector4i r = PCRTCDisplays[0].framebufferRect.runion(PCRTCDisplays[1].framebufferRect);
r = r.ralign<Align_Outside>(psm.bs);
PCRTCDisplays[0].framebufferRect.x -= r.x;
PCRTCDisplays[0].framebufferRect.y -= r.y;
PCRTCDisplays[0].framebufferRect.z -= r.x;
PCRTCDisplays[0].framebufferRect.w -= r.y;
PCRTCDisplays[1].framebufferRect.x -= r.x;
PCRTCDisplays[1].framebufferRect.y -= r.y;
PCRTCDisplays[1].framebufferRect.z -= r.x;
PCRTCDisplays[1].framebufferRect.w -= r.y;
}
}
}
// If the two displays are offset from each other, move them to the correct offsets.
// If using screen offsets, calculate the positions here.
// scanmask: when set, the anti-blur adjustments in the first pass of each branch are skipped.
// Both branches modify displayOffset.y (anti-blur) and displayRect of the displays in place.
void GSState::GSPCRTCRegs::CalculateDisplayOffset(bool scanmask)
{
const bool both_enabled = PCRTCDisplays[0].enabled && PCRTCDisplays[1].enabled;
// Offsets are generally ignored, the "hacky" way of doing the displays, but direct to framebuffers.
if (!GSConfig.PCRTCOffsets)
{
const GSVector4i offsets = !GSConfig.PCRTCOverscan ? VideoModeOffsets[videomode] : VideoModeOffsetsOverscan[videomode];
// Single line correction applied to each display, remembered so later steps do not apply it twice.
int int_off[2] = { 0, 0 };
GSVector2i zeroDisplay = NearestToZeroOffset();
GSVector2i baseOffset = PCRTCDisplays[zeroDisplay.y].displayOffset;
if (both_enabled)
{
// Anti-blur: a vertical difference of less than 4 between the displays is removed by
// pulling the lower display up onto the other one.
int blurOffset = abs(PCRTCDisplays[1].displayOffset.y - PCRTCDisplays[0].displayOffset.y);
if (GSConfig.PCRTCAntiBlur && !scanmask && blurOffset < 4)
{
if (PCRTCDisplays[1].displayOffset.y > PCRTCDisplays[0].displayOffset.y)
PCRTCDisplays[1].displayOffset.y -= blurOffset;
else
PCRTCDisplays[0].displayOffset.y -= blurOffset;
}
}
// If there's a single pixel offset, account for it else it can throw interlacing out.
for (int i = 0; i < 2; i++)
{
if (!PCRTCDisplays[i].enabled)
continue;
// Should this be MAGV/H in the DISPLAY register rather than the "default" magnification?
const int offset = (PCRTCDisplays[i].displayOffset.y - (offsets.w * (interlaced + 1))) / (VideoModeDividers[videomode].y + 1);
if (offset > 4)
continue;
// Only the lowest bit is applied here, carrying the sign of the offset.
int_off[i] = offset & 1;
if (offset < 0)
int_off[i] = -int_off[i];
PCRTCDisplays[i].displayRect.y += int_off[i];
PCRTCDisplays[i].displayRect.w += int_off[i];
}
// Handle difference in offset between the two displays, used in games like DmC and Time Crisis 2 (for split screen).
// Offset is not screen based, but relative to each other.
if (both_enabled)
{
GSVector2i offset = {
(PCRTCDisplays[1 - zeroDisplay.x].displayOffset.x - PCRTCDisplays[zeroDisplay.x].displayOffset.x) / (VideoModeDividers[videomode].x + 1),
(PCRTCDisplays[1 - zeroDisplay.y].displayOffset.y - PCRTCDisplays[zeroDisplay.y].displayOffset.y) / (VideoModeDividers[videomode].y + 1)
};
// Small differences are left alone when anti-blur is active; otherwise the display further
// from the base offset is moved by the difference.
if (offset.x >= 4 || !GSConfig.PCRTCAntiBlur || scanmask)
{
PCRTCDisplays[1 - zeroDisplay.x].displayRect.x += offset.x;
PCRTCDisplays[1 - zeroDisplay.x].displayRect.z += offset.x;
}
if (offset.y >= 4 || !GSConfig.PCRTCAntiBlur || scanmask)
{
PCRTCDisplays[1 - zeroDisplay.y].displayRect.y += offset.y - int_off[1 - zeroDisplay.y];
PCRTCDisplays[1 - zeroDisplay.y].displayRect.w += offset.y - int_off[1 - zeroDisplay.y];
}
// Re-read the base offset, the anti-blur step above may have changed it.
baseOffset = PCRTCDisplays[zeroDisplay.y].displayOffset;
}
// Handle any large vertical offset from the zero position on the screen.
// Example: Hokuto no Ken, does a roughly -14 offset to bring the screen up.
// Ignore the lowest bit, we've already accounted for this
// NOTE(review): the condition accepts any negative offset and positive offsets up to 4 - confirm that
// larger positive offsets are meant to be ignored.
int vOffset = ((static_cast<int>(baseOffset.y) - (offsets.w * (interlaced + 1))) / (VideoModeDividers[videomode].y + 1));
if(vOffset <= 4 && vOffset != 0)
{
PCRTCDisplays[0].displayRect.y += vOffset - int_off[0];
PCRTCDisplays[0].displayRect.w += vOffset - int_off[0];
PCRTCDisplays[1].displayRect.y += vOffset - int_off[1];
PCRTCDisplays[1].displayRect.w += vOffset - int_off[1];
}
}
else // We're using screen offsets, so just calculate the entire offset.
{
const GSVector4i offsets = !GSConfig.PCRTCOverscan ? VideoModeOffsets[videomode] : VideoModeOffsetsOverscan[videomode];
GSVector2i zeroDisplay = NearestToZeroOffset();
if (both_enabled)
{
// Same anti-blur adjustment of the vertical display offsets as in the branch above.
int blurOffset = abs(PCRTCDisplays[1].displayOffset.y - PCRTCDisplays[0].displayOffset.y);
if (GSConfig.PCRTCAntiBlur && !scanmask && blurOffset < 4)
{
if (PCRTCDisplays[1].displayOffset.y > PCRTCDisplays[0].displayOffset.y)
PCRTCDisplays[1].displayOffset.y -= blurOffset;
else
PCRTCDisplays[0].displayOffset.y -= blurOffset;
}
}
// Place each display rectangle at its offset from the video mode's base position, in output pixels.
// Disabled displays are processed as well.
for (int i = 0; i < 2; i++)
{
// Should this be MAGV/H in the DISPLAY register rather than the "default" magnification?
const GSVector2i offset = {
(static_cast<int>(PCRTCDisplays[i].displayOffset.x) - offsets.z) / (VideoModeDividers[videomode].x + 1),
(static_cast<int>(PCRTCDisplays[i].displayOffset.y) - (offsets.w * (interlaced + 1))) / (VideoModeDividers[videomode].y + 1)
};
PCRTCDisplays[i].displayRect.x += offset.x;
PCRTCDisplays[i].displayRect.z += offset.x;
PCRTCDisplays[i].displayRect.y += offset.y;
PCRTCDisplays[i].displayRect.w += offset.y;
}
if (both_enabled)
{
// Anti-blur on the final rectangles: a remaining positive difference below 4 pixels is removed
// by moving the display further from the base position onto the other one.
// NOTE(review): unlike the first anti-blur step this one does not look at scanmask - confirm this is intended.
const GSVector2i offset = {
(PCRTCDisplays[1 - zeroDisplay.x].displayRect.x - PCRTCDisplays[zeroDisplay.x].displayRect.x),
(PCRTCDisplays[1 - zeroDisplay.y].displayRect.y - PCRTCDisplays[zeroDisplay.y].displayRect.y)
};
if (offset.x > 0 && offset.x < 4 && GSConfig.PCRTCAntiBlur)
{
PCRTCDisplays[1 - zeroDisplay.x].displayRect.x -= offset.x;
PCRTCDisplays[1 - zeroDisplay.x].displayRect.z -= offset.x;
}
if (offset.y > 0 && offset.y < 4 && GSConfig.PCRTCAntiBlur)
{
PCRTCDisplays[1 - zeroDisplay.y].displayRect.y -= offset.y;
PCRTCDisplays[1 - zeroDisplay.y].displayRect.w -= offset.y;
}
}
}
}