Replies: 2 comments 2 replies
-
Just to make sure I understand what you're trying to do, is the script you provided us with for single-GPU? If not, can you share what it looks like when running on a single GPU?
Can you clarify what that means?
1 reply
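For anyone else reading along, here is a toy sketch of what a single-GPU energy-and-force pass usually looks like; the ToyEnergyModel below is a placeholder MLP rather than the actual GNN from this thread, only the autograd pattern matters:

```python
import torch
import torch.nn as nn

# Placeholder energy model: maps per-atom positions to a scalar total energy.
# A real model would do message passing over edge_index instead of an MLP.
class ToyEnergyModel(nn.Module):
    def __init__(self, hidden=32):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(3, hidden), nn.SiLU(), nn.Linear(hidden, 1))

    def forward(self, pos):
        return self.mlp(pos).sum()   # total energy E(x)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = ToyEnergyModel().to(device).eval()

pos = torch.randn(1000, 3, device=device, requires_grad=True)  # atom positions x
energy = model(pos)

# Force is the negative gradient of the energy w.r.t. positions, so the
# autograd graph must be kept even at inference time (no torch.no_grad()).
forces = -torch.autograd.grad(energy, pos)[0]
print(energy.item(), forces.shape)
```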
-
Was this resolved? I'm seeing a similar problem where the gradients are slightly off compared to a single-GPU baseline.
1 reply
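Not a fix, but in case it helps to pin down "slightly off": an explicit tolerance check between the two runs can separate harmless float32 reordering noise (collectives sum in a different order) from a genuinely detached gradient path. The tensor names below are placeholders:

```python
import torch

def compare_forces(forces_single: torch.Tensor, forces_multi: torch.Tensor) -> None:
    """Report absolute/relative error between two (num_atoms, 3) force tensors."""
    abs_err = (forces_single - forces_multi).abs()
    rel_err = abs_err / forces_single.abs().clamp_min(1e-12)
    print(f"max abs err: {abs_err.max().item():.3e}  "
          f"max rel err: {rel_err.max().item():.3e}")
    # Float32 sums taken in a different order typically agree only to ~1e-5.
    print("allclose:", torch.allclose(forces_single, forces_multi,
                                      rtol=1e-4, atol=1e-6))

# Example with fake data standing in for the single-GPU and multi-GPU runs:
f_ref = torch.randn(100, 3)
compare_forces(f_ref, f_ref + 1e-6 * torch.randn(100, 3))
```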
-
I am a chemistry researcher. We use a GNN to predict molecular energies and forces, and the force has to be obtained as the gradient of the energy, $F = -\,dE/dx$.
I have trained a model, but I want to speed up inference when simulating very many atoms. My idea is to shard the atoms across multiple GPUs during message passing and update x with torch.distributed or PyG's distributed tooling.
Below is my basic code.
I can shard x across the GPUs with DDP(model) and torch.all_gather(x) after MP1 and MP2, and that gives the correct energy. But the gradient is lost, and the gradient is needed to predict the forces, so I have no idea how to continue the inference.
Assume my CUDA memory is sufficient; for now I just want a speedup from splitting the message-passing calculation across multiple GPUs.
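A minimal sketch of one way to keep the gradient, assuming the model is replicated on every rank and the atoms are sharded; the "MP1"/"MP2" stages below are placeholder math, not the real message-passing layers. The point is that torch.distributed.all_gather is not autograd-aware (the gathered tensors are detached), while the collectives in torch.distributed.nn.functional are, so the graph survives the gather and torch.autograd.grad can still return forces:

```python
# Launch with: torchrun --nproc_per_node=<num_gpus> this_script.py
import torch
import torch.distributed as dist
import torch.distributed.nn.functional as dist_f


def main():
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    torch.cuda.set_device(rank)
    device = torch.device("cuda", rank)

    # Same placeholder "weights" on every rank (stands in for the trained GNN).
    torch.manual_seed(0)
    w1 = torch.randn(3, 16, device=device)

    # Each rank owns its own shard of atom positions (random placeholder data).
    torch.manual_seed(1234 + rank)
    pos_local = torch.randn(1000, 3, device=device, requires_grad=True)

    # "MP1" stand-in: local per-atom features (real code: PyG layers over edges).
    h_local = torch.tanh(pos_local @ w1)

    # Autograd-aware all_gather: unlike torch.distributed.all_gather, this keeps
    # the graph alive across ranks, and its backward reduce-scatters gradients
    # back to the rank that produced each shard.
    h_all = torch.cat(dist_f.all_gather(h_local), dim=0)  # (world_size * 1000, 16)

    # "MP2" + readout stand-in: energy contribution of the atoms THIS rank owns,
    # computed from the globally gathered features.
    e_local = (h_local * h_all.mean(dim=0)).sum()

    # Backprop only the local energy; cross-rank terms dE_j/dx_i arrive through
    # the collective's backward, so this equals -d(total E)/d(pos_local).
    forces_local = -torch.autograd.grad(e_local, pos_local)[0]

    # Total energy for reporting (no gradient needed, so plain all_reduce is fine).
    e_total = e_local.detach().clone()
    dist.all_reduce(e_total)

    if rank == 0:
        print("total energy:", e_total.item(), "local forces:", tuple(forces_local.shape))
    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```

Each rank backpropagates only its own local energy; the backward of the differentiable all_gather accumulates the cross-rank contributions, so the result is the gradient of the total energy with respect to that rank's positions. Whether this maps cleanly onto the real MP1/MP2 depends on the model, so treat it as a starting point rather than a drop-in replacement for DDP(model).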