SPU: Emulate voice decode buffers

This makes the timing of NAX advancing more similar to console since it emulates the decode buffer behaviour of it rushing ahead of playback until the buffer is full. It also makes interpolation of the first four samples more correct by using real data instead of the zero filled previous values. [SAVEVERSION+]
2025-12-16 04:08:48 +00:00 · 2025-11-05 22:06:52 +01:00 · 2025-11-05 22:06:52 +01:00 · 8328b4e278
commit 8328b4e278
parent 77dd916a6b
5 changed files with 106 additions and 135 deletions
--- a/pcsx2/SPU2/Debug.cpp
+++ b/pcsx2/SPU2/Debug.cpp
@ -200,7 +200,6 @@ void SPU2::DoFullDump()
 				fprintf(dump, "  - Sound Start Address: %x\n", Cores[c].Voices[v].StartA);
 				fprintf(dump, "  - Next Data Address:   %x\n", Cores[c].Voices[v].NextA);
 				fprintf(dump, "  - Play Status:         %s\n", (Cores[c].Voices[v].ADSR.Phase > 0) ? "Playing" : "Not Playing");
-				fprintf(dump, "  - Block Sample:        %d\n", Cores[c].Voices[v].SCurrent);
 			}
 			fprintf(dump, "#### END OF DUMP.\n\n");
 		}
--- a/pcsx2/SPU2/Mixer.cpp
+++ b/pcsx2/SPU2/Mixer.cpp
@ -89,55 +89,13 @@ int g_counter_cache_ignores = 0;
 #define XAFLAG_LOOP (1ul << 1)
 #define XAFLAG_LOOP_START (1ul << 2)

-static __forceinline s32 GetNextDataBuffered(V_Core& thiscore, uint voiceidx)
+static __forceinline void GetNextDataBuffered(V_Core& thiscore, uint voiceidx)
 {
 	V_Voice& vc(thiscore.Voices[voiceidx]);

-	if ((vc.SCurrent & 3) == 0)
+	if (vc.SBuffer == nullptr)
 	{
-		IncrementNextA(thiscore, voiceidx);
-
-		if ((vc.NextA & 7) == 0) // vc.SCurrent == 24 equivalent
-		{
-			if (vc.LoopFlags & XAFLAG_LOOP_END)
-			{
-				thiscore.Regs.ENDX |= (1 << voiceidx);
-				vc.NextA = vc.LoopStartA | 1;
-				if (!(vc.LoopFlags & XAFLAG_LOOP))
-				{
-					vc.Stop();
-
-					if (IsDevBuild)
-					{
-						if (SPU2::MsgVoiceOff())
-							SPU2::ConLog("* SPU2: Voice Off by EndPoint: %d \n", voiceidx);
-					}
-				}
-			}
-			else
-				vc.NextA++; // no, don't IncrementNextA here.  We haven't read the header yet.
-		}
-	}
-
-	if (vc.SCurrent == 28)
-	{
-		vc.SCurrent = 0;
-
-		// We'll need the loop flags and buffer pointers regardless of cache status:
-
-		for (int i = 0; i < 2; i++)
-			if (Cores[i].IRQEnable && Cores[i].IRQA == (vc.NextA & 0xFFFF8))
-				SetIrqCall(i);
-
-		s16* memptr = GetMemPtr(vc.NextA & 0xFFFF8);
-		vc.LoopFlags = *memptr >> 8; // grab loop flags from the upper byte.
-
-		if ((vc.LoopFlags & XAFLAG_LOOP_START) && !vc.LoopMode)
-		{
-			vc.LoopStartA = vc.NextA & 0xFFFF8;
-		}
-
-		const int cacheIdx = vc.NextA / pcm_WordsPerBlock;
+		const int cacheIdx = (vc.NextA & 0xFFFF8) / pcm_WordsPerBlock;
 		PcmCacheEntry& cacheLine = pcm_cache_data[cacheIdx];
 		vc.SBuffer = cacheLine.Sampledata;

@ -172,46 +130,18 @@ static __forceinline s32 GetNextDataBuffered(V_Core& thiscore, uint voiceidx)
 					g_counter_cache_misses++;
 			}

+
+			s16* memptr = GetMemPtr(vc.NextA & 0xFFFF8);
 			XA_decode_block(vc.SBuffer, memptr, vc.Prev1, vc.Prev2);
 		}
 	}

-	return vc.SBuffer[vc.SCurrent++];
-}
-
-static __forceinline void GetNextDataDummy(V_Core& thiscore, uint voiceidx)
-{
-	V_Voice& vc(thiscore.Voices[voiceidx]);
-
-	IncrementNextA(thiscore, voiceidx);
-
-	if ((vc.NextA & 7) == 0) // vc.SCurrent == 24 equivalent
+	// Get the sample index for NextA, we have to subtract 1 to ignore the loop header
+	int sampleIdx = ((vc.NextA % pcm_WordsPerBlock) - 1) * 4;
+	for (int i = 0; i < 4; i++)
 	{
-		if (vc.LoopFlags & XAFLAG_LOOP_END)
-		{
-			thiscore.Regs.ENDX |= (1 << voiceidx);
-			vc.NextA = vc.LoopStartA | 1;
-		}
-		else
-			vc.NextA++; // no, don't IncrementNextA here.  We haven't read the header yet.
+		vc.DecodeFifo[(vc.DecPosWrite + i) % 32] = vc.SBuffer[sampleIdx + i];
 	}
-
-	if (vc.SCurrent == 28)
-	{
-		for (int i = 0; i < 2; i++)
-			if (Cores[i].IRQEnable && Cores[i].IRQA == (vc.NextA & 0xFFFF8))
-				SetIrqCall(i);
-
-		vc.LoopFlags = *GetMemPtr(vc.NextA & 0xFFFF8) >> 8; // grab loop flags from the upper byte.
-
-		if ((vc.LoopFlags & XAFLAG_LOOP_START) && !vc.LoopMode)
-			vc.LoopStartA = vc.NextA & 0xFFFF8;
-
-		vc.SCurrent = 0;
-	}
-
-	vc.SP -= 0x1000 * (4 - (vc.SCurrent & 3));
-	vc.SCurrent += 4 - (vc.SCurrent & 3);
 }

 /////////////////////////////////////////////////////////////////////////////////////////
@ -237,6 +167,69 @@ static __forceinline StereoOut32 ApplyVolume(const StereoOut32& data, const V_Vo
 		ApplyVolume(data.Right, volume.Right.Value));
 }

+static __forceinline void UpdateBlockHeader(V_Core& thiscore, uint voiceidx)
+{
+	V_Voice& vc(thiscore.Voices[voiceidx]);
+
+	for (int i = 0; i < 2; i++)
+		if (Cores[i].IRQEnable && Cores[i].IRQA == (vc.NextA & 0xFFFF8))
+			SetIrqCall(i);
+
+	s16* memptr = GetMemPtr(vc.NextA & 0xFFFF8);
+	vc.LoopFlags = *memptr >> 8; // grab loop flags from the upper byte.
+
+	if ((vc.LoopFlags & XAFLAG_LOOP_START) && !vc.LoopMode)
+	{
+		vc.LoopStartA = vc.NextA & 0xFFFF8;
+	}
+}
+
+static __forceinline void DecodeSamples(uint coreidx, uint voiceidx)
+{
+	V_Core& thiscore(Cores[coreidx]);
+	V_Voice& vc(thiscore.Voices[voiceidx]);
+
+	// Update the block header on every audio frame
+	UpdateBlockHeader(thiscore, voiceidx);
+
+	// When a voice is started at 0 pitch, NAX quickly advances to SSA + 5
+	// So that would mean the decode buffer holds around 12 samples
+	if (((int)(vc.DecPosWrite - vc.DecPosRead)) > 12) {
+		// Sufficient data buffered
+		return;
+	}
+
+	if (vc.ADSR.Phase > V_ADSR::PHASE_STOPPED)
+	{
+		GetNextDataBuffered(thiscore, voiceidx);
+	}
+
+	vc.DecPosWrite += 4;
+
+	IncrementNextA(thiscore, voiceidx);
+	if ((vc.NextA & 7) == 0)
+	{
+		if (vc.LoopFlags & XAFLAG_LOOP_END)
+		{
+			thiscore.Regs.ENDX |= (1 << voiceidx);
+			vc.NextA = vc.LoopStartA;
+			if (!(vc.LoopFlags & XAFLAG_LOOP))
+			{
+				vc.Stop();
+
+				if (IsDevBuild)
+				{
+					if (SPU2::MsgVoiceOff())
+						SPU2::ConLog("* SPU2: Voice Off by EndPoint: %d \n", voiceidx);
+				}
+			}
+		}
+
+		IncrementNextA(thiscore, voiceidx);
+		vc.SBuffer = nullptr;
+	}
+}
+
 static void __forceinline UpdatePitch(uint coreidx, uint voiceidx)
 {
 	V_Voice& vc(Cores[coreidx].Voices[voiceidx]);
@ -278,33 +271,27 @@ static __forceinline void CalculateADSR(V_Core& thiscore, uint voiceidx)
 	pxAssume(vc.ADSR.Value >= 0); // ADSR should never be negative...
 }

-__forceinline static s32 GaussianInterpolate(s32 pv4, s32 pv3, s32 pv2, s32 pv1, s32 i)
+static __forceinline void ConsumeSamples(V_Core& thiscore, uint voiceidx)
 {
-	s32 out = 0;
-	out += (interpTable[i][0] * pv4) >> 15;
-	out += (interpTable[i][1] * pv3) >> 15;
-	out += (interpTable[i][2] * pv2) >> 15;
-	out += (interpTable[i][3] * pv1) >> 15;
+	V_Voice& vc(thiscore.Voices[voiceidx]);

-	return out;
+	int consumed = vc.SP >> 12;
+	vc.SP &= 0xfff;
+	vc.DecPosRead += consumed;
 }

 static __forceinline s32 GetVoiceValues(V_Core& thiscore, uint voiceidx)
 {
 	V_Voice& vc(thiscore.Voices[voiceidx]);

-	while (vc.SP >= 0)
-	{
-		vc.PV4 = vc.PV3;
-		vc.PV3 = vc.PV2;
-		vc.PV2 = vc.PV1;
-		vc.PV1 = GetNextDataBuffered(thiscore, voiceidx);
-		vc.SP -= 0x1000;
-	}
+	int phase = (vc.SP & 0x0ff0) >> 4;
+	s32 out = 0;
+	out += (interpTable[phase][0] * vc.DecodeFifo[(vc.DecPosRead + 0) % 32]) >> 15;
+	out += (interpTable[phase][1] * vc.DecodeFifo[(vc.DecPosRead + 1) % 32]) >> 15;
+	out += (interpTable[phase][2] * vc.DecodeFifo[(vc.DecPosRead + 2) % 32]) >> 15;
+	out += (interpTable[phase][3] * vc.DecodeFifo[(vc.DecPosRead + 3) % 32]) >> 15;

-	const s32 mu = vc.SP + 0x1000;
-
-	return GaussianInterpolate(vc.PV4, vc.PV3, vc.PV2, vc.PV1, (mu & 0x0ff0) >> 4);
+	return out;
 }

 // This is Dr. Hell's noise algorithm as implemented in pcsxr
@ -382,21 +369,13 @@ static __forceinline StereoOut32 MixVoice(uint coreidx, uint voiceidx)
 	V_Core& thiscore(Cores[coreidx]);
 	V_Voice& vc(thiscore.Voices[voiceidx]);

-	// If this assertion fails, it mans SCurrent is being corrupted somewhere, or is not initialized
-	// properly.  Invalid values in SCurrent will cause errant IRQs and corrupted audio.
-	pxAssertMsg((vc.SCurrent <= 28) && (vc.SCurrent != 0), "Current sample should always range from 1->28");
-
 	// Most games don't use much volume slide effects.  So only call the UpdateVolume
 	// methods when needed by checking the flag outside the method here...
 	// (Note: Ys 6 : Ark of Nephistm uses these effects)

 	vc.Volume.Update();

-	// SPU2 Note: The spu2 continues to process voices for eternity, always, so we
-	// have to run through all the motions of updating the voice regardless of it's
-	// audible status.  Otherwise IRQs might not trigger and emulation might fail.
-
-	UpdatePitch(coreidx, voiceidx);
+	DecodeSamples(coreidx, voiceidx);

 	StereoOut32 voiceOut(0, 0);
 	s32 Value = 0;
@ -419,11 +398,14 @@ static __forceinline StereoOut32 MixVoice(uint coreidx, uint voiceidx)

 		voiceOut = ApplyVolume(StereoOut32(Value, Value), vc.Volume);
 	}
-	else
-	{
-		while (vc.SP >= 0)
-			GetNextDataDummy(thiscore, voiceidx); // Dummy is enough
-	}
+
+	// SPU2 Note: The spu2 continues to process voices for eternity, always, so we
+	// have to run through all the motions of updating the voice regardless of it's
+	// audible status.  Otherwise IRQs might not trigger and emulation might fail.
+
+	UpdatePitch(coreidx, voiceidx);
+
+	ConsumeSamples(thiscore, voiceidx);

 	// Write-back of raw voice data (post ADSR applied)
 	if (voiceidx == 1)
--- a/pcsx2/SPU2/defs.h
+++ b/pcsx2/SPU2/defs.h
@ -256,22 +256,16 @@ struct V_Voice
 	// Sample pointer (19:12 bit fixed point)
 	s32 SP;

-	// Previous sample values - used for interpolation
-	// Inverted order of these members to match the access order in the
-	//   code (might improve cache hits).
-	s32 PV4;
-	s32 PV3;
-	s32 PV2;
-	s32 PV1;
-
 	// Last outputted audio value, used for voice modulation.
 	s32 OutX;

 	// SBuffer now points directly to an ADPCM cache entry.
 	s16* SBuffer;

-	// sample position within the current decoded packet.
-	s32 SCurrent;
+	// Each voice has a buffer of decoded samples
+	s32 DecodeFifo[32];
+	u32 DecPosWrite;
+	u32 DecPosRead;

 	// it takes a few ticks for voices to start on the real SPU2?
 	void Start();
--- a/pcsx2/SPU2/spu2sys.cpp
+++ b/pcsx2/SPU2/spu2sys.cpp
@ -181,7 +181,6 @@ void V_Core::Init(int index)
 		VoiceGates[v].WetR = -1;

 		Voices[v].Volume = V_VolumeSlideLR(0, 0); // V_VolumeSlideLR::Max;
-		Voices[v].SCurrent = 28;

 		Voices[v].ADSR.Counter = 0;
 		Voices[v].ADSR.Value = 0;
@ -190,6 +189,10 @@ void V_Core::Init(int index)
 		Voices[v].NextA = 0x2801;
 		Voices[v].StartA = 0x2800;
 		Voices[v].LoopStartA = 0x2800;
+
+		memset(Voices[v].DecodeFifo, 0, sizeof(Voices[v].DecodeFifo));
+		Voices[v].DecPosRead = 0;
+		Voices[v].DecPosWrite = 0;
 	}

 	DMAICounter = 0;
@ -212,22 +215,18 @@ void V_Voice::Start()
 	}

 	ADSR.Attack();
-	SCurrent = 28;
 	LoopMode = 0;

-	// When SP >= 0 the next sample will be grabbed, we don't want this to happen
-	// instantly because in the case of pitch being 0 we want to delay getting
-	// the next block header. This is a hack to work around the fact that unlike
-	// the HW we don't update the block header on every cycle.
-	SP = -1;
+	SP = 0;

 	LoopFlags = 0;
 	NextA = StartA | 1;
 	Prev1 = 0;
 	Prev2 = 0;

-	PV1 = PV2 = 0;
-	PV3 = PV4 = 0;
+	SBuffer = nullptr;
+	DecPosRead = 0;
+	DecPosWrite = 0;
 }

 void V_Voice::Stop()
@ -1014,12 +1013,10 @@ static void RegWrite_VoiceAddr(u16 value)
 			// Wallace And Gromit: Curse Of The Were-Rabbit.

 			thisvoice.NextA = ((u32)(value & 0x0F) << 16) | (thisvoice.NextA & 0xFFF8) | 1;
-			thisvoice.SCurrent = 28;
 			break;

 		case 5:
 			thisvoice.NextA = (thisvoice.NextA & 0x0F0000) | (value & 0xFFF8) | 1;
-			thisvoice.SCurrent = 28;
 			break;
 	}
 }
@ -1237,7 +1234,6 @@ static void RegWrite_Core(u16 value)
 				for (uint v = 0; v < 24; ++v)
 				{
 					Cores[1].Voices[v].Volume = V_VolumeSlideLR(0, 0); // V_VolumeSlideLR::Max;
-					Cores[1].Voices[v].SCurrent = 28;

 					Cores[1].Voices[v].ADSR.Value = 0;
 					Cores[1].Voices[v].ADSR.Phase = 0;
--- a/pcsx2/SaveState.h
+++ b/pcsx2/SaveState.h
@ -25,7 +25,7 @@ enum class FreezeAction
 // [SAVEVERSION+]
 // This informs the auto updater that the users savestates will be invalidated.

-static const u32 g_SaveVersion = (0x9A56 << 16) | 0x0000;
+static const u32 g_SaveVersion = (0x9A57 << 16) | 0x0000;


 // the freezing data between submodules and core