Blog / Home
About
Media Gallery

Welcome
to
Thronic.com

ժʝ_

The Internals of a Game Trainer


This goes through the steps of how some of my external game trainers work. The way CE uses unconditional jump hooking in its AOB lua script injection scheme to do its hacks makes this method (jump hooking) pretty much the equivalent when moving it over into your own c/c++ based program. The code illustrated here is for manipulating existing assembly code in a game or program, so creating code caves for your own code makes sense. If you simply need to change a value directly or NOP something, you can just WPM it directly.

This note only covers the key parts of a trainer, not unrelated parts such as the window or the UI. I've usually done transparent windows with just a simple double-buffered GDI UI. Another option is to wrap it in a DLL and use C# for the UI, or just do a console program. Exception and error handling is also not in focus and is up to you to take care of. In that sense, this note assumes some pre-existing coding experience and is meant as inspiration for the key parts, not a spoon-fed recipe for a full trainer. I can mention that I've used WM_NCHITTEST for dragging the window and simply PlaySound() for user feedback (activated/success/etc.) in addition to text status on the UI via the GDI buffer.


Step 0, Preparing Process Rights (RPM/WPM/Alloc) & Information (Handles, Addresses)

I've wrapped prep code in a class named hC1 that prepares the trainer to have debugging rights (only really needed for system processes) and to get a handle to the process and base addresses to any modules that may be needed (I just make extra calls from PrepProcess), like GameClient.dll or similar. Opening the process with the following rights also prepares it to allow allocating memory inside it, as well as RPM/WPM. Used simply by creating an object and calling the PrepProcess() function (after adapting it for a specific trainer).

class hC1
{
	public:
	SYSTEM_INFO SysInfo;
	HWND __HWNDCss;
	HANDLE __HandleProcess;
	PROCESSENTRY32W __GameProcess;
	uintptr_t ProcBasePtr = 0x0;
	uintptr_t ProcEndPtr = 0x0;

	//
	//	Funksjon for å hente PID til målprosess.
	//
	private:
	bool FindProcessName(
		const wchar_t *__ProcessName,
		PROCESSENTRY32W *pEntry
	) {
		PROCESSENTRY32W __ProcessEntry = { 0 };
		__ProcessEntry.dwSize = sizeof(PROCESSENTRY32W);
		HANDLE hSnapshot = CreateToolhelp32Snapshot(
			TH32CS_SNAPPROCESS,
			0
		);

		if (hSnapshot == INVALID_HANDLE_VALUE)
			return false;

		if (!Process32FirstW(hSnapshot, &__ProcessEntry)) {
			CloseHandle(hSnapshot);
			return false;
		}

		do {
			if (!_wcsicmp(__ProcessEntry.szExeFile, __ProcessName))
			{
				memcpy(
					(void *)pEntry,
					(void *)&__ProcessEntry,
					sizeof(PROCESSENTRY32W)
				);
				CloseHandle(hSnapshot);
				return true;
			}

		} while (Process32NextW(hSnapshot, &__ProcessEntry));

		CloseHandle(hSnapshot);
		return false;
	}

	//
	//	Funksjon for debug-privilegier for oss selv. 
	//	Gir optimale tilgangsrettigheter mot målprosess.
	//
	void runSetDebugPrivs()
	{
		HANDLE __HandleProcess = GetCurrentProcess(), __HandleToken;
		TOKEN_PRIVILEGES priv;
		LUID __LUID;

		OpenProcessToken(
			__HandleProcess,
			TOKEN_ADJUST_PRIVILEGES,
			&__HandleToken
		);

		LookupPrivilegeValue(0, "SeDebugPrivilege", &__LUID);
		priv.PrivilegeCount = 1;
		priv.Privileges[0].Luid = __LUID;
		priv.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
		AdjustTokenPrivileges(__HandleToken, false, &priv, 0, 0, 0);

		CloseHandle(__HandleToken);
		CloseHandle(__HandleProcess);
	}

	//
	//	Funksjon for henting av minnepekere til moduler.
	//	Brukes som entrypoints for offsets vi har funnet.
	//
	uintptr_t GetModuleNamePointer(
		const wchar_t* LPSTRModuleName,
		DWORD __DwordProcessId
	) {
		MODULEENTRY32W lpModuleEntry = { 0 };
		lpModuleEntry.dwSize = sizeof(MODULEENTRY32W);
		HANDLE hSnapShot = CreateToolhelp32Snapshot(
			TH32CS_SNAPMODULE,
			__DwordProcessId
		);

		if (!hSnapShot)
			return 0x0;

		lpModuleEntry.dwSize = sizeof(lpModuleEntry);
		BOOL __RunModule = Module32FirstW(hSnapShot, &lpModuleEntry);

		while (__RunModule)
		{
			if (!_wcsicmp(lpModuleEntry.szModule, LPSTRModuleName))
			{
				// Regn ut sluttadresse som søk kan stoppe på.
				ProcEndPtr = (uintptr_t)lpModuleEntry.modBaseAddr +
					(uintptr_t)lpModuleEntry.modBaseSize;

				return (uintptr_t)lpModuleEntry.modBaseAddr;
			}
			__RunModule = Module32NextW(hSnapShot, &lpModuleEntry);
		}

		CloseHandle(hSnapShot);
		return 0x0;
	}

	//
	//	Funksjon for klargjøring av prosess og tilhørende moduler.
	//	Kjøres en enkelt gang under oppstart og kaster evt. et unntak 
	//	for videre feilhåndtering og avslutting av programmet.
	//
	public:
	void PrepProcess()
	{
		runSetDebugPrivs();

		// Vi skal søke i hele det tilgjengelige minneområdet. 
		GetSystemInfo(&SysInfo);

		// Prosessinfo.
		if(!FindProcessName(
				L"Game.exe",
				&__GameProcess
		))
			throw 1;

		// HANDLE til hovedprosess.
		__HandleProcess = OpenProcess(
			PROCESS_VM_READ |
			PROCESS_VM_WRITE |
			PROCESS_VM_OPERATION |
			PROCESS_QUERY_INFORMATION,
			false,
			__GameProcess.th32ProcessID
		);

		// Peker til baseadresse til hovedprosess.
		if((ProcBasePtr = GetModuleNamePointer(
				L"Game.exe",
				__GameProcess.th32ProcessID
		)) == 0x0)
			throw 1;
	}
};


Step 1, Get The Bytes

I first use Cheat Engine's disassemble, debug and dissect functionality and/or other debuggers to look at what code I want to patch and then create an AOB auto assembly script (based right off the default template) to generate and test the bytes (both search pattern and shellcode bytes) I need to copy for my trainer. When I have that I go to this website to quickly format (disassemble) into strings and arrays to use in my trainer.


Step 2, Prepare Bytes & Addresses

I set up 2 byte arrays per patch, one for the injection point and one for the shellcode.
//
//	Max HP bytes.
//
unsigned char HpCaveBytes[47] = { // Shellcode, one instruction per line.
	0xF2, 0x0F, 0x58, 0xC1,				// addsd    xmm0,xmm1       (same bytes as search pattern)
	0xF2, 0x0F, 0x5A, 0xE8,				// cvtsd2ss xmm5,xmm0       (same bytes as search pattern)
	0x50,						// push     rax
	0x48, 0xB8,					// mov      rax, imm64 ...
	0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,	// ... imm64 = 0x64 (100)
	0xF3, 0x48, 0x0F, 0x2A, 0xE8,			// cvtsi2ss xmm5,rax
	0x58,						// pop      rax
	0xF3, 0x0F, 0x11, 0xAE, 0x50, 0x02, 0x00, 0x00,	// movss    [rsi+0x250],xmm5 (same bytes as search pattern)

	// Return jump: jmp qword ptr [rip+0], followed by the 8-byte
	// absolute address slot (index 39) that is filled in at runtime.
	0xFF, 0x25, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
unsigned char HpEntryBytes[16] = { // Hook written over the injection point.
	// jmp qword ptr [rip+0]
	0xFF, 0x25, 0x00, 0x00, 0x00, 0x00,
	// 8-byte absolute address slot (index 6), filled in at runtime.
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	// Two NOPs padding the 14-byte jump up to 16 bytes.
	0x90, 0x90
};

And a quick helper function to use this information if a user pushes F1 (from a trainer main loop).
//
//	F1
//
// Trainer main-loop hotkey: trigger the "Max HP" patch while F1 is held.
// Bit 0x8000 of GetAsyncKeyState() is set when the key is currently down.
// NOTE(review): MainLoopInjector is not shown here; the parameter meanings
// below come from the inline comments — confirm against its definition.
if (GetAsyncKeyState(VK_F1) & 0x8000) {
	MainLoopInjector(
		"Max HP",	// A short description for UI.
		1,		// What arrays to use (1 is the Hp case in InjectBytesAt's switch).
		2,		// How many NOPs needed for entry jump (HpEntryBytes ends in two 0x90).
		39,		// Where in the shellcode to inject return jump (slot right after FF 25 00 00 00 00).
		&HpActive,	// Just an active bool test upon successful injection.
		// Search pattern: 16 bytes split over two literals. It contains
		// embedded 0x00 bytes, so its length must be taken from the mask,
		// never from strlen() on the pattern itself.
		"\xF2\x0F\x58\xC1\xF2\x0F\x5A\xE8\xF3\x0F\x11"	// Search pattern.
		"\xAE\x50\x02\x00\x00",
		"xxxxxxxxxxxxxxxx"				// Mask ('x' = must match; the scanner treats '?' as wildcard).
	);
}

Since this is taken from a 64-bit trainer, I need to account for 8 byte address space. That's why I place the return address at index 39, right after FF 25 RIP+0 in the shellcode. If this was 32-bit, I'd either have a 68 00 00 00 00 C3 6-byte push(68) and return(C3) that pops and uses absolute offset, or an E9 00 00 00 00 5-byte relative to next instruction jmp. I'll include some examples of methods I've used so far for better insight and future reference. You can create jumps by messing around with EIP/RIP (32/64 instruction pointers) positioning as well, but I like just using addresses.


Assembly jump examples

(TargetPtr = Addr from memory byte pattern scan)

Using 68+C3 (PUSH+Absolute RET, 6 byte requirement, used for 32-bit/max 4-byte address use)
// Return address in cavebytes.
*(DWORD*)&CaveBytes[ReturnAddrLocInCaveBytes] = (DWORD)(
	TargetPtr + (5 + NopCount)
);

Using E9 (Relative Destination - Current Location JMP, 5 byte requirement, used for 32-bit/max 4-byte address use)
// Jump to codecave/shellcode.
*(DWORD*)&EntryBytes[1] = (DWORD)( // 1 = index right after 0xE9/JMP
	CaveAddr - TargetPtr - 5
);

Using FF 25 (Absolute JMP, 14 byte requirement, used for 64-bit/max 8-byte address use)
// Return address in cavebytes: the first byte after the overwritten
// entry bytes (14-byte FF 25 jump + NOP padding).
// For the jump to the cave, just write CaveAddr into the entry bytes.
//
// NOTE: Even if using e.g. mov rax,addr|jmp rax (48 B8 [0000000000000000] FF E0) I would
//       still use at least 12 bytes as well as take up a register. I ended up preferring
//       FF 25 when enough room as rax was often used. It's explicit, direct, clear and
//       easy to use.
*(DWORD64*)&CaveBytes[ReturnAddrLocInCaveBytes] = (DWORD64)(
	TargetPtr + (14 + NopCount)
);
Note that mentioning 64/32-bit here doesn't directly have anything to do with compiled executable version, it's just memory space in regards to jump range.

The only practical difference is pretty much the address to the cave since relativity is to the base pointer to get back. The byte requirement is essential only in terms of how big the code you want to replace is. If the jump itself takes 14 bytes and you only want to replace e.g. 4, then you need to take the additional code with you into the "codecave" to run it there instead and NOP the margin to the return location. This is usually not an issue unless the relocated code uses dynamic values, and those are usually pointers held in registers. Make sure you don't clobber these registers in the shellcode segment, or at least restore their values afterwards. But it shouldn't normally be necessary.

The remaining parts of the Trainer is pretty much just memory scanning and injecting. I scan the entire committed process memory space since a lot of game code can sometimes be JIT (created Just-In-Time), usually takes only a few seconds in 64-bit (my experience with DDR3 1600MHz, modern DDR4 architecture is at least 30% faster). In cases where you can't spare a few seconds wait time, just split the work into threads. I've done this when I wanted to scan all the injection points at once.

Quick note on Endianness
When messing around in memory you'll see bytes reversed in hex dumps, showing the least significant byte first, because memory on most modern computers operates in little-endian mode. You as the human will need to understand that e.g. E9 24 AE 0B F6 really means 0xF60BAE24 (in this case jumping 167006684 bytes backwards, i.e. a displacement of -167006684), which is how we normally deal with values: in decreasing significance from left to right.

Making 5 byte jumps even in 64-bit memory space
Hook space is often sparse, and to avoid having to do "massive" 14-byte jumps vs only 5-byte E9 jumps, you can loop the memory allocation close to the address where you are jumping from to try and get within reach of a signed 32-bit displacement (±2 GB), which is almost guaranteed for just a single or a few pages (typically 0x1000 = 4096 bytes each) of memory. But I still sometimes opt for a 14-byte jump if it's not too impractical.
// Loop memory space forward from hook point to 
// try and find cave space as close as possible.

uintptr_t memPageStep = 0;
while ((CaveAddr = (uintptr_t)VirtualAllocEx(
	hProc.__HandleProcess,
	(void*)(TargetPtr + memPageStep),
	1000,
	MEM_COMMIT | MEM_RESERVE,
	PAGE_EXECUTE_READWRITE
)) == NULL)
	memPageStep += 1000;



Conditional jumps

A quick additional note on conditional jumps. Sometimes when I e.g. replace a E9 jump with a larger FF 25 jump, the conditional jumps may skew. These are relative and only need the number of bytes you want to jump as a byte hex value. Doing this in bytes will translate into the opcodes needed in assembly and is easy to work with. Below are some examples of some of the most common ones.

Short 2-byte jumps that will jump 10 bytes ahead from the end of their position (last byte is distance):
77 0A = JA
73 0A = JAE
72 0A = JB
76 0A = JBE
74 0A = JE
7F 0A = JG
7D 0A = JGE
7C 0A = JL
7E 0A = JLE

Near 6-byte jumps that will jump 10 bytes ahead from the end of their position (last 4 bytes is the distance, a signed 32-bit value stored as a little-endian DWORD):
0F 87 0A 00 00 00 = JA
0F 83 0A 00 00 00 = JAE
0F 82 0A 00 00 00 = JB
0F 86 0A 00 00 00 = JBE
0F 84 0A 00 00 00 = JE
0F 8F 0A 00 00 00 = JG
0F 8D 0A 00 00 00 = JGE
0F 8C 0A 00 00 00 = JL
0F 8E 0A 00 00 00 = JLE

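So, to jump 10 bytes ahead, write 0xA as the distance: as a DWORD (0A 00 00 00) for the near form, or as a single byte (0A) for the short form. The distance is signed and counted from the end of the jump instruction, so the CPU adds it to the address of the next instruction to get the destination.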


Step 3, Memory Scanning

Updated late April 2020, multi-threading.
//
//	Klasse for å gjennomsøke tilgjengelig og åpent minne i en målprosess.
//
class BytePatternScanner
{
	private:
	//
	//	Hjelpefunksjoner for gjennomsøk av bulker med minne.
	//
	static uintptr_t FindAddressOfByteArray(
		const unsigned char* Bytes,
		const char* Mask,
		unsigned char* Buf,
		SIZE_T BufSize
	) {
		SIZE_T MaskLen = strlen(Mask);
		bool Found = false;

		// Søk gjennom buffer.
		for (unsigned int i = 0; i < BufSize - MaskLen; i++) {

			// Sjekk etter mønster, med hensyn til maske.
			Found = true;
			for (unsigned int j = 0; j < MaskLen; j++) {

				// Ta høyde for maske/universaltegn.
				if (
					Mask[j] != '?' && Bytes[j] != Buf[i + j]
				) {
					Found = false;
					break;
				}
			}

			// Returner offset i Buf hvis møster ble funnet.
			if (Found)
				return i;
		}

		return 0;
	}


	//
	//	Struktur for MemoryRegister i ScanMemory().
	//
	struct ProcMemRegion {
		uintptr_t AddrToReadFrom = 0;
		SIZE_T NumBytesToRead;
	};


	public:
	//
	//	Hjelpefunksjon for å lese minneregister opprettet ved hjelp av 
	//	VirtualQueryEx som har sjekket prosessminne som er tilgjengelig.
	//
	static void ReadMemoryRegister(
		const unsigned char* Pattern,
		const char* Mask,
		std::vector<ProcMemRegion> MemoryRegister,
		std::size_t IndexStart,
		std::size_t IndexStop
	){
		uintptr_t ByteOffsetFound;	// Mønsterposisjon ved funn.
		unsigned char* ReadBuf;		// RPM mellomlager.
		SIZE_T BytesRead = 0;		// RPM lesestatus.

		for (std::size_t n=IndexStart; n<=IndexStop; n++) {

			// Klargjør buffer.
			ReadBuf = new unsigned char[MemoryRegister[n].NumBytesToRead];

			// Les minneområde.
			if (ReadProcessMemory(
				hProc.__HandleProcess,
				(const void*)MemoryRegister[n].AddrToReadFrom,
				ReadBuf,
				MemoryRegister[n].NumBytesToRead,
				&BytesRead
			) == 0) {
				// Feil ved lesing, fortsett til neste.
				continue;
			}

			// Gjør et mønstersøk.
			ByteOffsetFound = FindAddressOfByteArray(
				Pattern, // Det vi vil finne. 
				Mask, // Maske med wildcard.
				ReadBuf, // Søk i innlest buffer.
				BytesRead // Bruk kun lest lengde.
			);

			// Regn ut minne offset.
			if (ByteOffsetFound != 0) {
				TargetPtrGlobal = MemoryRegister[n].AddrToReadFrom + ByteOffsetFound;
				break;
			}

			// Sjekk om noen andre tråder har funnet målet allerede.
			if (TargetPtrGlobal != 0)
				break;
			
			// Frigjør mellomlager.
			delete[] ReadBuf;
		}
	}


	//
	//	Hovedfunksjon for gjennomsøk av minne.
	//
	uintptr_t ScanMemory(
		const unsigned char* Pattern,
		const char* Mask
	) {
		// Funksjonsvariabler.
		uintptr_t TargetPtr = 0;	// Måladresse;
		uintptr_t MemPos = 0;		// Iterator for VirtualQueryEx.
		MEMORY_BASIC_INFORMATION mbi;	// Infostruktur for VirtualQueryEx.
		uintptr_t StartAddr = (uintptr_t)hProc.SysInfo.lpMinimumApplicationAddress;
		uintptr_t EndAddr = (uintptr_t)hProc.SysInfo.lpMaximumApplicationAddress;
		std::vector<ProcMemRegion> MemoryRegister; // Minne funnet via VirtualQueryEx.


		//
		//	Søk gjennom minne og registrer tilgjengelige prosessområder.
		//
		while ((StartAddr + MemPos + strlen(Mask)) <= EndAddr) {

			//
			//	Test neste minneside i modulen via VirtualQueryEx og MBI.
			//	MBI vil inneholde informasjon om minnet relatert til prosess.
			//
			if (VirtualQueryEx(
				hProc.__HandleProcess,
				(LPCVOID)(StartAddr + MemPos),
				&mbi,
				sizeof(mbi)
			) == 0) {
				MemPos += mbi.RegionSize;
				continue;
			}

			//
			//	Registrer i minneregisteret vårt 
			//	hvis minneområdet er tilgjengelig. 
			//
			if (
				mbi.State == MEM_COMMIT &&
				!(mbi.Protect & (PAGE_NOACCESS | PAGE_GUARD))
			) {
				// Legg til i register.
				MemoryRegister.push_back({
					StartAddr + MemPos,
					mbi.RegionSize
				});
			}

			// Flytt iterator til neste minneområde.
			MemPos += mbi.RegionSize;
		}


		//
		//	Les alle tilgjengelige områder og søk etter mønster.
		//

		// Arbeid hver tråd skal utføre.
		std::size_t IndexStep = (MemoryRegister.size()-1) / 4;

		// Start tråder.
		std::thread t1 (ReadMemoryRegister, Pattern, Mask, MemoryRegister, 0, IndexStep);
		std::thread t2 (ReadMemoryRegister, Pattern, Mask, MemoryRegister, IndexStep+1, IndexStep*2);
		std::thread t3 (ReadMemoryRegister, Pattern, Mask, MemoryRegister, (IndexStep*2)+1, IndexStep*3);
		std::thread t4 (ReadMemoryRegister, Pattern, Mask, MemoryRegister, (IndexStep*3)+1, MemoryRegister.size()-1);

		// Vent på tråd(er).
		t1.join();
		t2.join();
		t3.join();
		t4.join();

		// Oppdater måladresse.
		TargetPtr = TargetPtrGlobal;
		TargetPtrGlobal = 0;

		// Returner måladresse.
		return TargetPtr;
	}
};

The injection step is just allocating a cave and using the address the memory scanner returns to WPM the final byte arrays once the addresses are inserted into them, as seen above where I use a DWORD (4 byte) and DWORD64 (8 byte) pointers to the target index address of the byte arrays and apply the respective amount of bytes into them, instead of using loops and extra code. You will rarely see multiple indexes being inserted like this in beginner C/C++ tutorials (leet'ish people will use variations of them, different type casts etc, without explanation on YT vids and forums). Simply put, since pointer arithmetic and array indexing are equivalent, this works.


Step 4, Injecting Entrypoint and Codecave

//
//	Funksjon som tar seg av selve injisering og kodehule.  
//
bool InjectBytesAt(
	uintptr_t TargetPtr, 
	int NopCount,
	int WhatTarget,
	int ReturnAddrLocInCaveBytes
){
	//
	//	For midl. execute/read/write beskyttelse ved RPM/WPM.
	//	Vi bruker kun ved behov, pga. økt deteksjonsfaktor.
	//
	//DWORD OldProtect;

	// Reserver litt minne i målprosess.
	uintptr_t CaveAddr = (uintptr_t)VirtualAllocEx(
		hProc.__HandleProcess,
		NULL,
		1024,	// Usually plenty of bytes for an individual cave.
		MEM_COMMIT | MEM_RESERVE,
		PAGE_EXECUTE_READWRITE
	);

	if (CaveAddr == 0)
		return false;

	// Hva skal vi injisere?
	unsigned char* CaveBytes;
	unsigned char* EntryBytes;
	SIZE_T EntryByteNum, CaveByteNum;
	switch (WhatTarget) {
		case 1:
			CaveBytes = HpCaveBytes;
			EntryBytes = HpEntryBytes;
			EntryByteNum = sizeof(HpEntryBytes);
			CaveByteNum = sizeof(HpCaveBytes);
			break;
		case 2:
			CaveBytes = EnergyCaveBytes;
			EntryBytes = EnergyEntryBytes;
			EntryByteNum = sizeof(EnergyEntryBytes);
			CaveByteNum = sizeof(EnergyCaveBytes);
			break;
		default:
			return false;
	}

	// Plasser returadresse i cavebytes som skal injiseres.
	*(DWORD64*)&CaveBytes[ReturnAddrLocInCaveBytes] = (DWORD64)(
		TargetPtr + 
		(14 + NopCount)
	);
	
	// Skriv til allokert minneområde.
	if (WriteProcessMemory(
		hProc.__HandleProcess,
		(void*)CaveAddr,
		CaveBytes,
		CaveByteNum,
		NULL
	) == 0) {
		return false;
	}

	*(DWORD64*)&EntryBytes[6] = (DWORD64)(
		CaveAddr
	);
	
	/*VirtualProtectEx(
		hProc.__HandleProcess, (void*)CaveAddr, 
		sizeof(CaveBytes), PAGE_EXECUTE_READWRITE, 
		&OldProtect
	);*/
	if (WriteProcessMemory(
		hProc.__HandleProcess,
		(void*)TargetPtr,
		EntryBytes,
		EntryByteNum,
		NULL
	) == 0) {
		return false;
	}
	/*VirtualProtectEx(
		hProc.__HandleProcess, (void*)CaveAddr, 
		sizeof(CaveBytes), OldProtect, 
		&OldProtect
	);*/

	return true;
}

I've commented VirtualProtectEx as it triggers protections and has been an obstacle more than help, but I've included it to demonstrate its potential usage. I've never actually had to use it yet, but I wanted to test it once when troubleshooting, and include it here for reference. It's only wrapped around the injection point, since that's not my own allocated memory and may have a different protection. But haven't really needed it yet.

I'd like to reiterate what I wrote in the beginning. This note is meant for inspiration, not a copy & paste project. If it provided reference, help and/or increased understanding for even only a small part, then I'm happy for writing it.


Stepwise experiences gained during the development of trainers


Original Post: Jan 27th, '22 19:12 CET.
Updated: Jan 27th, '22 19:13 CET.

C/C++
π