# GAMMALab / OpenHGNN

 
			
							import dgl
import dgl.function as fn
import numpy as np
import torch as th
import scipy.sparse as ssp
import array
import torch


class Budget(object):
    """Per-node-type sampling budget accumulated from normalized in-degree.

    For every node type the budget is a 1-D float tensor with one entry per
    node.  ``update`` raises the entries of the predecessors of freshly
    sampled nodes, and ``pop`` clears entries once their nodes were picked.

    Attributes:
        n_types: dict mapping node type -> budget tensor (initialised to zero).
        NS: mapping node type -> tensor of node ids that are already sampled.
            The object is stored by reference, so later changes made by the
            owner are visible to subsequent ``update`` calls.
        hg: graph object; only ``canonical_etypes`` and
            ``predecessors(node, etype=...)`` are used here.
    """

    def __init__(self, hg, n_types, NS):
        """Create an all-zero budget.

        Args:
            hg: graph providing ``canonical_etypes`` and ``predecessors``.
            n_types: mapping node type -> number of nodes of that type.
            NS: mapping node type -> tensor of already sampled node ids.
        """
        self.n_types = {ntype: th.zeros(count) for ntype, count in n_types.items()}
        self.NS = NS
        self.hg = hg

    def update(self, dst_type, idxs):
        """Add budget to the not-yet-sampled predecessors of ``idxs``.

        For every relation whose destination type is ``dst_type`` and every
        node in ``idxs``, each predecessor that is not listed in
        ``NS[src_type]`` receives ``1 / in_degree``, where ``in_degree`` is the
        number of predecessors of that node under the relation *before*
        filtering.

        Args:
            dst_type: node type of the nodes in ``idxs``.
            idxs: iterable of node ids (e.g. a 1-D tensor).
        """
        for etype in self.hg.canonical_etypes:
            src_type, _, etype_dst = etype
            if etype_dst != dst_type:
                continue

            budget = self.n_types[src_type]
            # NS is not modified inside this method, so one lookup per
            # relation is enough.
            sampled = self.NS[src_type] if src_type in self.NS else None

            for node in idxs:
                src_idx = self.hg.predecessors(node, etype=etype)
                in_degree = src_idx.shape[0]
                if in_degree == 0:
                    continue
                if sampled is not None:
                    # One broadcast comparison instead of a Python-level
                    # membership test per predecessor.
                    already_sampled = (
                        src_idx.reshape(-1, 1) == sampled.reshape(1, -1)
                    ).any(dim=1)
                    src_idx = src_idx[~already_sampled]
                if src_idx.shape[0] > 0:
                    budget[src_idx] += 1 / in_degree

    def pop(self, type, idx):
        """Reset the budget of nodes ``idx`` of node type ``type`` to zero.

        The parameter name ``type`` shadows the builtin; it is kept unchanged
        so that keyword callers keep working.
        """
        self.n_types[type][idx] = 0


class HGTsampler(object):
    """Budget-based subgraph sampler for a heterogeneous graph.

    Starting from seed nodes of ``category`` it repeatedly draws, for every
    node type with a positive budget, up to ``num_nodes_per_type`` nodes with
    probability proportional to their budget.
    """

    def __init__(self, hg, category, num_nodes_per_type, num_steps):
        """Store the sampling configuration.

        Args:
            hg: graph providing ``ntypes``, ``num_nodes``, ``subgraph`` and
                whatever ``Budget`` needs.
            category: node type of the seed nodes.
            num_nodes_per_type: maximum number of nodes drawn per node type
                in each step.
            num_steps: number of sampling rounds.
        """
        self.n_types = {ntype: hg.num_nodes(ntype) for ntype in hg.ntypes}
        self.category = category
        self.num_nodes_per_type = num_nodes_per_type
        self.num_steps = num_steps
        self.hg = hg

    def sampler_subgraph(self, seed_nodes):
        """Sample a subgraph around ``seed_nodes``.

        Args:
            seed_nodes: sequence of tensors accepted by ``th.stack``; the
                stacked result is used as the ids of the seed nodes.

        Returns:
            Tuple ``(sg, OS)`` where ``OS`` maps node type -> tensor of
            sampled node ids and ``sg`` is ``hg.subgraph(OS)``.
        """
        OS = {self.category: th.stack(seed_nodes)}
        # The budget keeps a reference to this very dict, so nodes added to
        # OS below are excluded from later budget updates.
        B = Budget(self.hg, self.n_types, OS)
        B.update(self.category, OS[self.category])

        for _ in range(self.num_steps):
            for src_type, p in B.n_types.items():
                # Skip node types without nodes or without any budget.
                if p.numel() == 0 or not p.max() > 0:
                    continue
                # Sampling without replacement must not request more draws
                # than there are entries with non-zero probability.
                num_candidates = int((p > 0).sum())
                num_samples = min(self.num_nodes_per_type, num_candidates)
                sampled_idx = th.multinomial(p / th.sum(p), num_samples, replacement=False)
                if src_type in OS:
                    OS[src_type] = th.cat((OS[src_type], sampled_idx))
                else:
                    OS[src_type] = sampled_idx
                # Spread budget to the neighbours of the new nodes, then
                # retire the new nodes from the candidate pool.
                B.update(src_type, sampled_idx)
                B.pop(src_type, sampled_idx)

        sg = self.hg.subgraph(OS)
        return sg, OS


def HGT_preprocess4mag(hg, train_idx):
    """Turn a MAG-style heterograph into a homogeneous graph for HGT.

    Steps: move to CPU, add an ``'<name>_inv'`` reverse relation for every
    relation, remove parallel edges, attach timestamps, propagate paper
    features to the other node types, append log10 in-degree to the
    features, build a training mask and convert to a homogeneous graph with
    one extra self-loop relation.

    Args:
        hg: heterograph with a ``'paper'`` node type carrying ``'year'`` and
            ``'feat'`` node data, and relations named ``'has_topic'``,
            ``'writes'`` and ``'affiliated_with'`` (the last three are looked
            up by name below).
        train_idx: mapping whose ``'paper'`` entry indexes the training papers.

    Returns:
        Homogeneous graph with node data ``'timestamp'``, ``'feat'``,
        ``'train_mask'``, ``'ntype'``, ``'nid'`` and edge data ``'etype'``.
        Self-loop edges get the edge type id ``len(hg2.etypes)``.
    """
    hg = hg.to('cpu')

    # Forward relations plus one reversed copy of each.
    forward = {etype: hg.edges(etype=etype) for etype in hg.canonical_etypes}
    reverse = {(v, e + '_inv', u): (dst, src) for (u, e, v), (src, dst) in forward.items()}
    # Pass node counts explicitly: otherwise they are inferred from the
    # largest id that has an edge, and the feature assignments below fail
    # whenever trailing nodes are isolated.
    num_nodes_dict = {ntype: hg.num_nodes(ntype) for ntype in hg.ntypes}
    hg2 = dgl.heterograph({**forward, **reverse}, num_nodes_dict=num_nodes_dict)
    hg2 = dgl.to_simple(hg2)

    # Initialize year
    # NOTE(review): non-paper timestamps are int64 zeros; presumably 'year'
    # is int64 as well so the dtypes agree in to_homogeneous -- confirm.
    hg2.nodes['paper'].data['timestamp'] = hg.nodes['paper'].data['year'].squeeze()
    for ntype in hg.ntypes:
        if ntype != 'paper':
            hg2.nodes[ntype].data['timestamp'] = th.zeros(hg2.num_nodes(ntype), dtype=th.int64)

    # Aggregate bag-of-paper features
    hg2.nodes['paper'].data['feat'] = hg.nodes['paper'].data['feat']
    hg2.update_all(fn.copy_u('feat', 'm'), fn.mean('m', 'feat'), etype='has_topic')  # field_of_study
    hg2.update_all(fn.copy_u('feat', 'm'), fn.mean('m', 'feat'), etype='writes_inv')  # author
    hg2.update_all(fn.copy_u('feat', 'm'), fn.mean('m', 'feat'), etype='affiliated_with')  # institution

    # Attach log-degree to feature of each node type
    degree = {ntype: th.zeros(hg2.num_nodes(ntype)) for ntype in hg2.ntypes}
    for utype, etype, vtype in hg2.canonical_etypes:
        # The canonical triple stays unambiguous even if a relation name
        # is shared by several (src, dst) type pairs.
        degree[vtype] += hg2.in_degrees(etype=(utype, etype, vtype))
    for ntype in hg2.ntypes:
        # NOTE(review): a node with zero in-degree gets -inf here.
        hg2.nodes[ntype].data['feat'] = th.cat([
            hg2.nodes[ntype].data['feat'],
            th.log10(degree[ntype][:, None])], 1)

    # Training mask: True only for the papers listed in train_idx['paper'].
    for ntype in hg2.ntypes:
        train_mask = th.zeros(hg2.num_nodes(ntype), dtype=th.bool)
        if ntype == 'paper':
            train_mask[train_idx['paper']] = True
        hg2.nodes[ntype].data['train_mask'] = train_mask

    # Convert to homogeneous graph and add self-loop
    # 'train_mask' must be listed here, otherwise it is dropped and
    # train_idx has no effect on the result.
    g = dgl.to_homogeneous(hg2, ndata=['timestamp', 'feat', 'train_mask'])
    g.edata['etype'] = g.edata[dgl.ETYPE]
    g.ndata['ntype'] = g.ndata[dgl.NTYPE]
    g.ndata['nid'] = g.ndata[dgl.NID]
    del g.edata[dgl.ETYPE]
    del g.edata[dgl.EID]
    del g.ndata[dgl.NTYPE]
    del g.ndata[dgl.NID]
    num_nodes = g.num_nodes()
    g = dgl.add_self_loop(g)
    # The self-loops are the last num_nodes edges; give them their own type id.
    g.edata['etype'][-num_nodes:] = len(hg2.etypes)

    return g