grape.embiggen

Module with models for graph and text embedding and their Keras Sequences.

"""Module with models for graph and text embedding and their Keras Sequences."""
from .embedders import (
    CBOW, GloVe, GraphCBOW, GraphGloVe, GraphSkipGram,
    SkipGram, TransE, TransH, TransR, SimplE, Siamese
)
from .node_prediction import GraphConvolutionalNeuralNetwork
from .sequences import Word2VecSequence
from .transformers import (CorpusTransformer, EdgeTransformer,
                           GraphTransformer, LinkPredictionTransformer,
                           NodeTransformer)
from .visualizations import GraphVisualization

__all__ = [
    "CBOW",
    "SkipGram",
    "GloVe",
    "GraphCBOW",
    "GraphSkipGram",
    "GraphGloVe",
    "Word2VecSequence",
    "NodeTransformer",
    "EdgeTransformer",
    "GraphTransformer",
    "CorpusTransformer",
    "LinkPredictionTransformer",
    "GraphVisualization",
    "TransE",
    "TransH",
    "TransR",
    "SimplE",
    "Siamese",
    "GraphConvolutionalNeuralNetwork"
]
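
The classes above are re-exported at the package root, so downstream code can import them directly. A minimal, illustrative import sketch (assuming the package is importable as grape.embiggen):

from grape.embiggen import GraphCBOW, GraphVisualization, NodeTransformer

# Graph embedders such as GraphCBOW consume a Graph instance; the transformers
# and GraphVisualization then operate on the resulting embeddings.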
#   class CBOW(grape.embiggen.embedders.embedder.Embedder):
class CBOW(Embedder):
    """CBOW model for sequence embedding.

    The CBOW model for graph embedding receives a list of contexts and tries
    to predict the central word. The model makes use of an NCE loss layer
    during the training process to generate the negatives.
    """

    def __init__(
        self,
        window_size: int = 4,
        negative_samples: int = 10,
        use_gradient_centralization: bool = True,
        **kwargs: Dict
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        window_size: int = 4,
            Window size for the local context.
            On the borders the window size is trimmed.
        negative_samples: int = 10,
            The number of negative classes to randomly sample per batch.
            This single sample of negative classes is evaluated for each element in the batch.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        **kwargs: Dict,
            Additional kwargs to pass to parent constructor.
        """
        # TODO! Figure out a way to test for Zipfian distribution in the
        # data used for the word2vec sampling! The values in the vocabulary
        # should have a decreasing node degree order!
        self._window_size = validate_window_size(window_size)
        self._negative_samples = negative_samples
        super().__init__(
            use_gradient_centralization=use_gradient_centralization,
            **kwargs
        )

    def _build_model(self) -> Model:
        """Return CBOW model."""
        # Creating the inputs layers

        # Create first the input with the central terms
        central_terms_input = Input(
            (1, ),
            dtype=tf.int32,
            name="CentralTermsInput"
        )

        # Then we create the input of the contextual terms
        contextual_terms_input = Input(
            (self._window_size*2, ),
            dtype=tf.int32,
            name="ContextualTermsInput"
        )

        # Creating the embedding layer for the contexts
        contextual_terms_embedding = Embedding(
            input_dim=self._vocabulary_size,
            output_dim=self._embedding_size,
            input_length=self._window_size*2,
            name=Embedder.TERMS_EMBEDDING_LAYER_NAME,
        )(contextual_terms_input)

        contextual_embedding = GlobalAveragePooling1D()(
            contextual_terms_embedding
        )

        # Adding layer that also executes the loss function
        sampled_softmax = SampledSoftmax(
            vocabulary_size=self._vocabulary_size,
            embedding_size=self._embedding_size,
            negative_samples=self._negative_samples,
        )((contextual_embedding, central_terms_input))

        # Creating the actual model
        model = Model(
            inputs=[contextual_terms_input, central_terms_input],
            outputs=sampled_softmax,
            name="CBOW"
        )
        return model

    def _compile_model(self) -> Model:
        """Compile model."""
        # No loss function is needed because it is already executed in
        # the Sampled Softmax loss layer.
        self._model.compile(
            optimizer=self._optimizer
        )

CBOW model for sequence embedding.

The CBOW model for graph embedding receives a list of contexts and tries to predict the central word. The model makes use of an NCE loss layer during the training process to generate the negatives.

#   CBOW( window_size: int = 4, negative_samples: int = 10, use_gradient_centralization: bool = True, **kwargs: Dict )
    def __init__(
        self,
        window_size: int = 4,
        negative_samples: int = 10,
        use_gradient_centralization: bool = True,
        **kwargs: Dict
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        window_size: int = 4,
            Window size for the local context.
            On the borders the window size is trimmed.
        negative_samples: int = 10,
            The number of negative classes to randomly sample per batch.
            This single sample of negative classes is evaluated for each element in the batch.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        **kwargs: Dict,
            Additional kwargs to pass to parent constructor.
        """
        # TODO! Figure out a way to test for Zipfian distribution in the
        # data used for the word2vec sampling! The values in the vocabulary
        # should have a decreasing node degree order!
        self._window_size = validate_window_size(window_size)
        self._negative_samples = negative_samples
        super().__init__(
            use_gradient_centralization=use_gradient_centralization,
            **kwargs
        )

Create new sequence Embedder model.

Parameters
  • window_size (int = 4,): Window size for the local context. On the borders the window size is trimmed.
  • negative_samples (int = 10,): The number of negative classes to randomly sample per batch. This single sample of negative classes is evaluated for each element in the batch.
  • use_gradient_centralization (bool = True,): Whether to wrap the provided optimizer into a normalized one that centralizes the gradient. It is automatically enabled if the current version of TensorFlow supports gradient transformers. More detail here: https://arxiv.org/pdf/2004.01461.pdf
  • **kwargs (Dict,): Additional kwargs to pass to parent constructor.
Inherited Members
grape.embiggen.embedders.embedder.Embedder
TERMS_EMBEDDING_LAYER_NAME
trainable
summary
get_layer_weights
embedding
get_embedding_dataframe
save_embedding
name
save_weights
load_weights
fit
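
A rough construction sketch for CBOW follows; the vocabulary_size and embedding_size values are illustrative and are assumed to be forwarded through **kwargs to the parent Embedder constructor, which provides the inherited summary and fit members listed above.

model = CBOW(
    window_size=4,           # local context of 4 terms on each side
    negative_samples=10,     # negative classes sampled per batch
    vocabulary_size=10_000,  # illustrative value, forwarded to Embedder
    embedding_size=100,      # illustrative value, forwarded to Embedder
)
model.summary()  # inherited from Embedder
# Training goes through the inherited fit method, typically with a
# Word2VecSequence yielding (contextual terms, central terms) batches.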
#   class SkipGram(grape.embiggen.embedders.embedder.Embedder):
class SkipGram(Embedder):
    """SkipGram model for sequence embedding.

    The SkipGram model for graph embedding receives a central word and tries
    to predict its contexts. The model makes use of an NCE loss layer
    during the training process to generate the negatives.
    """

    def __init__(
        self,
        window_size: int = 4,
        negative_samples: int = 10,
        use_gradient_centralization: bool = True,
        **kwargs: Dict
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        window_size: int = 4,
            Window size for the local context.
            On the borders the window size is trimmed.
        negative_samples: int = 10,
            The number of negative classes to randomly sample per batch.
            This single sample of negative classes is evaluated for each element in the batch.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        **kwargs: Dict,
            Additional kwargs to pass to parent constructor.
        """
        # TODO! Figure out a way to test for Zipfian distribution in the
        # data used for the word2vec sampling! The values in the vocabulary
        # should have a decreasing node degree order!
        self._window_size = validate_window_size(window_size)
        self._negative_samples = negative_samples
        super().__init__(**kwargs)

    def _build_model(self) -> Model:
        """Return SkipGram model."""
        # Creating the inputs layers

        # Create first the input with the central terms
        central_terms_input = Input(
            (1, ),
            dtype=tf.int32,
            name="CentralTermsInput"
        )

        # Then we create the input of the contextual terms
        contextual_terms_input = Input(
            (self._window_size*2, ),
            dtype=tf.int32,
            name="ContextualTermsInput"
        )

        # Creating the embedding layer for the contexts
        central_terms_embedding = Embedding(
            input_dim=self._vocabulary_size,
            output_dim=self._embedding_size,
            input_length=1,
            name=Embedder.TERMS_EMBEDDING_LAYER_NAME,
        )(central_terms_input)

        central_embedding = Flatten()(
            central_terms_embedding
        )

        # Adding layer that also executes the loss function
        output = NoiseContrastiveEstimation(
            vocabulary_size=self._vocabulary_size,
            embedding_size=self._embedding_size,
            negative_samples=self._negative_samples,
            positive_samples=self._window_size*2
        )((central_embedding, contextual_terms_input))

        # Creating the actual model
        model = Model(
            inputs=[contextual_terms_input, central_terms_input],
            outputs=output,
            name="SkipGram"
        )
        return model

    def _compile_model(self) -> Model:
        """Compile model."""
        # No loss function is needed because it is already executed in
        # the Noise Contrastive Estimation loss layer.
        self._model.compile(
            optimizer=self._optimizer
        )

SkipGram model for sequence embedding.

The SkipGram model for graph embedding receives a central word and tries to predict its contexts. The model makes use of an NCE loss layer during the training process to generate the negatives.

#   SkipGram( window_size: int = 4, negative_samples: int = 10, use_gradient_centralization: bool = True, **kwargs: Dict )
    def __init__(
        self,
        window_size: int = 4,
        negative_samples: int = 10,
        use_gradient_centralization: bool = True,
        **kwargs: Dict
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        window_size: int = 4,
            Window size for the local context.
            On the borders the window size is trimmed.
        negative_samples: int = 10,
            The number of negative classes to randomly sample per batch.
            This single sample of negative classes is evaluated for each element in the batch.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        **kwargs: Dict,
            Additional kwargs to pass to parent constructor.
        """
        # TODO! Figure out a way to test for Zipfian distribution in the
        # data used for the word2vec sampling! The values in the vocabulary
        # should have a decreasing node degree order!
        self._window_size = validate_window_size(window_size)
        self._negative_samples = negative_samples
        super().__init__(**kwargs)

Create new sequence Embedder model.

Parameters
  • window_size (int = 4,): Window size for the local context. On the borders the window size is trimmed.
  • negative_samples (int = 10,): The number of negative classes to randomly sample per batch. This single sample of negative classes is evaluated for each element in the batch.
  • use_gradient_centralization (bool = True,): Whether to wrap the provided optimizer into a normalized one that centralizes the gradient. It is automatically enabled if the current version of TensorFlow supports gradient transformers. More detail here: https://arxiv.org/pdf/2004.01461.pdf
  • **kwargs (Dict,): Additional kwargs to pass to parent constructor.
Inherited Members
grape.embiggen.embedders.embedder.Embedder
TERMS_EMBEDDING_LAYER_NAME
trainable
summary
get_layer_weights
embedding
get_embedding_dataframe
save_embedding
name
save_weights
load_weights
fit
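
SkipGram mirrors the CBOW setup with the roles of the two inputs swapped: the central term is embedded and compared against its contextual terms through the Noise Contrastive Estimation layer, whereas CBOW pools the context embeddings and feeds them to a Sampled Softmax layer. A construction sketch with illustrative sizes, under the same assumptions as the CBOW sketch above:

model = SkipGram(
    window_size=4,
    negative_samples=10,
    vocabulary_size=10_000,  # illustrative value, forwarded to Embedder
    embedding_size=100,      # illustrative value, forwarded to Embedder
)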
#   class GloVe(grape.embiggen.embedders.embedder.Embedder):
class GloVe(Embedder):
    """GloVe model for graph and words embedding.

    The GloVe model for graph embedding receives two words and is asked to
    predict their cooccurrence probability.
    """

    def __init__(
        self,
        vocabulary_size: int,
        embedding_size: int,
        embedding: Union[np.ndarray, pd.DataFrame] = None,
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
        optimizer: Union[str, Optimizer] = None,
        alpha: float = 0.75,
        random_state: int = 42,
        directed: bool = False,
        use_gradient_centralization: bool = True,
    ):
        """Create new GloVe-based Embedder object.

        Parameters
        -------------------------------
        vocabulary_size: int,
            Number of terms to embed.
            In a graph this is the number of nodes, while in a text it is
            the number of unique words.
        embedding_size: int,
            Dimension of the embedding.
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        optimizer: Union[str, Optimizer] = None,
            The optimizer to be used during the training of the model.
            By default, if None is provided, Nadam with learning rate
            set at 0.01 is used.
        alpha: float = 0.75,
            Exponent of the weighting function applied to the co-occurrence values in the GloVe loss.
        random_state: int = 42,
            The random state to reproduce the training sequence.
        directed: bool = False,
            Whether to treat the data as directed or not.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        self._alpha = alpha
        self._random_state = random_state
        self._directed = directed
        super().__init__(
            vocabulary_size=vocabulary_size,
            embedding_size=embedding_size,
            embedding=embedding,
            extra_features=extra_features,
            optimizer=optimizer,
            use_gradient_centralization=use_gradient_centralization
        )

    def _glove_loss(self, y_true: tf.Tensor, y_pred: tf.Tensor) -> float:
        """Compute the glove loss function.

        Parameters
        ---------------------------
        y_true: tf.Tensor,
            The true values Tensor for this batch.
        y_pred: tf.Tensor,
            The predicted values Tensor for this batch.

        Returns
        ---------------------------
        Loss function score related to this batch.
        """
        return K.sum(
            K.pow(K.clip(y_true, 0.0, 1.0), self._alpha) *
            K.square(y_pred - K.log(y_true)),
            axis=-1
        )

    def _build_model(self):
        """Create new Glove model."""
        # Creating the input layers
        left_input_layer = Input((1,), name="left_input_layer")
        right_input_layer = Input((1,), name="right_input_layer")

        trainable_left_embedding = Embedding(
            self._vocabulary_size,
            self._embedding_size,
            input_length=1,
            weights=None if self._embedding is None else [
                self._embedding
            ],
            name=Embedder.TERMS_EMBEDDING_LAYER_NAME
        )(left_input_layer)

        trainable_right_embedding = Embedding(
            self._vocabulary_size,
            self._embedding_size,
            input_length=1,
        )(right_input_layer)

        if self._extra_features is not None:
            extra_features_matrix = Embedding(
                # The shape of the extra features matrix gives the input and
                # output dimensions of this frozen embedding layer.
                *self._extra_features.shape,
                input_length=1,
                weights=[self._extra_features],
                trainable=False,
                name="extra_features_matrix"
            )
            trainable_left_embedding = Concatenate()([
                extra_features_matrix(left_input_layer),
                trainable_left_embedding
            ])
            trainable_right_embedding = Concatenate()([
                extra_features_matrix(right_input_layer),
                trainable_right_embedding
            ])

        # Creating the dot product of the embedding layers
        dot_product_layer = Dot(axes=2)([
            trainable_left_embedding,
            trainable_right_embedding
        ])

        # Creating the biases layer
        biases = [
            Embedding(self._vocabulary_size, 1, input_length=1)(input_layer)
            for input_layer in (left_input_layer, right_input_layer)
        ]

        # Concatenating with an add the three layers
        prediction = Flatten()(Add()([dot_product_layer, *biases]))

        # Creating the model
        glove = Model(
            inputs=[
                left_input_layer,
                right_input_layer
            ],
            outputs=prediction,
            name="GloVe"
        )

        return glove

    def _compile_model(self) -> Model:
        """Compile model."""
        self._model.compile(
            loss=self._glove_loss,
            optimizer=self._optimizer
        )

    def fit(
        self,
        X: Tuple[np.ndarray, np.ndarray],
        frequencies: np.ndarray,
        *args: List,
        epochs: int = 1000,
        batch_size: int = 2**20,
        early_stopping_monitor: str = "loss",
        early_stopping_min_delta: float = 0.001,
        early_stopping_patience: int = 10,
        early_stopping_mode: str = "min",
        reduce_lr_monitor: str = "loss",
        reduce_lr_min_delta: float = 0.01,
        reduce_lr_patience: int = 10,
        reduce_lr_mode: str = "min",
        reduce_lr_factor: float = 0.9,
        verbose: int = 1,
        **kwargs: Dict
    ) -> pd.DataFrame:
        """Return pandas dataframe with training history.

        Parameters
        -----------------------
        X: Tuple[np.ndarray, np.ndarray],
            Tuple with source and destinations.
        frequencies: np.ndarray,
            The frequencies to predict.
        *args: List,
            Other arguments to provide to the model.
        epochs: int = 1000,
            Epochs to train the model for.
        batch_size: int = 2**20,
            The batch size.
            Typically batch sizes for the GloVe model can be immense.
        early_stopping_monitor: str = "loss",
            Metric to monitor for early stopping.
        early_stopping_min_delta: float = 0.001,
            Minimum delta of metric to stop the training.
        early_stopping_patience: int = 10,
            Number of epochs to wait when the given minimum delta is not
            achieved, after which early stopping is triggered.
        early_stopping_mode: str = "min",
            Direction of the variation of the monitored metric for early stopping.
        reduce_lr_monitor: str = "loss",
            Metric to monitor for reducing learning rate.
        reduce_lr_min_delta: float = 0.01,
            Minimum delta of metric to reduce learning rate.
        reduce_lr_patience: int = 10,
            Number of epochs to wait when the given minimum delta is not
            achieved, after which the learning rate is reduced.
        reduce_lr_mode: str = "min",
            Direction of the variation of the monitored metric for learning rate.
        reduce_lr_factor: float = 0.9,
            Factor for reduction of learning rate.
        verbose: int = 1,
            Whether to show the loading bar.
            Specifically, the options are:
            * 0 or False: No loading bar.
            * 1 or True: Showing only the loading bar for the epochs.
            * 2: Showing loading bar for both epochs and batches.
        **kwargs: Dict,
            Additional kwargs to pass to the Keras fit call.

        Raises
        -----------------------
        ValueError,
            If the given verbose value is not within the available set (-1, 0, 1).

        Returns
        -----------------------
        Dataframe with training history.
        """
        return super().fit(
            GloveSequence(
                *X, frequencies,
                batch_size=batch_size,
                directed=self._directed,
                random_state=self._random_state
            ),
            *args,
            epochs=epochs,
            early_stopping_monitor=early_stopping_monitor,
            early_stopping_min_delta=early_stopping_min_delta,
            early_stopping_patience=early_stopping_patience,
            early_stopping_mode=early_stopping_mode,
            reduce_lr_monitor=reduce_lr_monitor,
            reduce_lr_min_delta=reduce_lr_min_delta,
            reduce_lr_patience=reduce_lr_patience,
            reduce_lr_mode=reduce_lr_mode,
            reduce_lr_factor=reduce_lr_factor,
            verbose=verbose,
            **kwargs
        )

GloVe model for graph and words embedding.

The GloVe model for graph embedding receives two words and is asked to predict their cooccurrence probability.

#   GloVe( vocabulary_size: int, embedding_size: int, embedding: Union[numpy.ndarray, pandas.core.frame.DataFrame] = None, extra_features: Union[numpy.ndarray, pandas.core.frame.DataFrame] = None, optimizer: Union[str, keras.optimizer_v2.optimizer_v2.OptimizerV2] = None, alpha: float = 0.75, random_state: int = 42, directed: bool = False, use_gradient_centralization: bool = True )
    def __init__(
        self,
        vocabulary_size: int,
        embedding_size: int,
        embedding: Union[np.ndarray, pd.DataFrame] = None,
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
        optimizer: Union[str, Optimizer] = None,
        alpha: float = 0.75,
        random_state: int = 42,
        directed: bool = False,
        use_gradient_centralization: bool = True,
    ):
        """Create new GloVe-based Embedder object.

        Parameters
        -------------------------------
        vocabulary_size: int,
            Number of terms to embed.
            In a graph this is the number of nodes, while in a text it is
            the number of unique words.
        embedding_size: int,
            Dimension of the embedding.
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        optimizer: Union[str, Optimizer] = None,
            The optimizer to be used during the training of the model.
            By default, if None is provided, Nadam with learning rate
            set at 0.01 is used.
        alpha: float = 0.75,
            Exponent of the weighting function applied to the co-occurrence values in the GloVe loss.
        random_state: int = 42,
            The random state to reproduce the training sequence.
        directed: bool = False,
            Whether to treat the data as directed or not.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        self._alpha = alpha
        self._random_state = random_state
        self._directed = directed
        super().__init__(
            vocabulary_size=vocabulary_size,
            embedding_size=embedding_size,
            embedding=embedding,
            extra_features=extra_features,
            optimizer=optimizer,
            use_gradient_centralization=use_gradient_centralization
        )

Create new GloVe-based Embedder object.

Parameters
  • vocabulary_size (int,): Number of terms to embed. In a graph this is the number of nodes, while in a text it is the number of unique words.
  • embedding_size (int,): Dimension of the embedding.
  • embedding (Union[np.ndarray, pd.DataFrame] = None,): The seed embedding to be used. Note that it is not possible to provide at once both the embedding and either the vocabulary size or the embedding size.
  • extra_features (Union[np.ndarray, pd.DataFrame] = None,): Optional extra features to be used during the computation of the embedding. The features must be available for all the elements considered for the embedding.
  • optimizer (Union[str, Optimizer] = None,): The optimizer to be used during the training of the model. By default, if None is provided, Nadam with learning rate set at 0.01 is used.
  • alpha (float = 0.75,): Exponent of the weighting function applied to the co-occurrence values in the GloVe loss.
  • random_state (int = 42,): The random state to reproduce the training sequence.
  • directed (bool = False,): Whether to treat the data as directed or not.
  • use_gradient_centralization (bool = True,): Whether to wrap the provided optimizer into a normalized one that centralizes the gradient. It is automatically enabled if the current version of TensorFlow supports gradient transformers. More detail here: https://arxiv.org/pdf/2004.01461.pdf
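A construction sketch with illustrative sizes; leaving optimizer as None selects the default Nadam with learning rate 0.01 described above.

glove = GloVe(
    vocabulary_size=10_000,  # illustrative number of terms
    embedding_size=100,
    alpha=0.75,              # exponent of the co-occurrence weighting
    directed=False,          # co-occurrences treated as symmetric
)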
#   def fit( self, X: Tuple[numpy.ndarray, numpy.ndarray], frequencies: numpy.ndarray, *args: List, epochs: int = 1000, batch_size: int = 1048576, early_stopping_monitor: str = 'loss', early_stopping_min_delta: float = 0.001, early_stopping_patience: int = 10, early_stopping_mode: str = 'min', reduce_lr_monitor: str = 'loss', reduce_lr_min_delta: float = 0.01, reduce_lr_patience: int = 10, reduce_lr_mode: str = 'min', reduce_lr_factor: float = 0.9, verbose: int = 1, **kwargs: Dict ) -> pandas.core.frame.DataFrame:
    def fit(
        self,
        X: Tuple[np.ndarray, np.ndarray],
        frequencies: np.ndarray,
        *args: List,
        epochs: int = 1000,
        batch_size: int = 2**20,
        early_stopping_monitor: str = "loss",
        early_stopping_min_delta: float = 0.001,
        early_stopping_patience: int = 10,
        early_stopping_mode: str = "min",
        reduce_lr_monitor: str = "loss",
        reduce_lr_min_delta: float = 0.01,
        reduce_lr_patience: int = 10,
        reduce_lr_mode: str = "min",
        reduce_lr_factor: float = 0.9,
        verbose: int = 1,
        **kwargs: Dict
    ) -> pd.DataFrame:
        """Return pandas dataframe with training history.

        Parameters
        -----------------------
        X: Tuple[np.ndarray, np.ndarray],
            Tuple with source and destinations.
        frequencies: np.ndarray,
            The frequencies to predict.
        *args: List,
            Other arguments to provide to the model.
        epochs: int = 1000,
            Epochs to train the model for.
        batch_size: int = 2**20,
            The batch size.
            Typically batch sizes for the GloVe model can be immense.
        early_stopping_monitor: str = "loss",
            Metric to monitor for early stopping.
        early_stopping_min_delta: float = 0.001,
            Minimum delta of metric to stop the training.
        early_stopping_patience: int = 10,
            Number of epochs to wait when the given minimum delta is not
            achieved, after which early stopping is triggered.
        early_stopping_mode: str = "min",
            Direction of the variation of the monitored metric for early stopping.
        reduce_lr_monitor: str = "loss",
            Metric to monitor for reducing learning rate.
        reduce_lr_min_delta: float = 0.01,
            Minimum delta of metric to reduce learning rate.
        reduce_lr_patience: int = 10,
            Number of epochs to wait when the given minimum delta is not
            achieved, after which the learning rate is reduced.
        reduce_lr_mode: str = "min",
            Direction of the variation of the monitored metric for learning rate.
        reduce_lr_factor: float = 0.9,
            Factor for reduction of learning rate.
        verbose: int = 1,
            Whether to show the loading bar.
            Specifically, the options are:
            * 0 or False: No loading bar.
            * 1 or True: Showing only the loading bar for the epochs.
            * 2: Showing loading bar for both epochs and batches.
        **kwargs: Dict,
            Additional kwargs to pass to the Keras fit call.

        Raises
        -----------------------
        ValueError,
            If the given verbose value is not within the available set (-1, 0, 1).

        Returns
        -----------------------
        Dataframe with training history.
        """
        return super().fit(
            GloveSequence(
                *X, frequencies,
                batch_size=batch_size,
                directed=self._directed,
                random_state=self._random_state
            ),
            *args,
            epochs=epochs,
            early_stopping_monitor=early_stopping_monitor,
            early_stopping_min_delta=early_stopping_min_delta,
            early_stopping_patience=early_stopping_patience,
            early_stopping_mode=early_stopping_mode,
            reduce_lr_monitor=reduce_lr_monitor,
            reduce_lr_min_delta=reduce_lr_min_delta,
            reduce_lr_patience=reduce_lr_patience,
            reduce_lr_mode=reduce_lr_mode,
            reduce_lr_factor=reduce_lr_factor,
            verbose=verbose,
            **kwargs
        )

Return pandas dataframe with training history.

Parameters
  • X (Tuple[np.ndarray, np.ndarray],): Tuple with source and destinations.
  • frequencies (np.ndarray,): The frequencies to predict.
  • *args (List,): Other arguments to provide to the model.
  • epochs (int = 1000,): Epochs to train the model for.
  • batch_size (int = 2**20,): The batch size. Typically batch sizes for the GloVe model can be immense.
  • early_stopping_monitor (str = "loss",): Metric to monitor for early stopping.
  • early_stopping_min_delta (float = 0.001,): Minimum delta of metric to stop the training.
  • early_stopping_patience (int = 10,): Number of epochs to wait when the given minimum delta is not achieved, after which early stopping is triggered.
  • early_stopping_mode (str = "min",): Direction of the variation of the monitored metric for early stopping.
  • reduce_lr_monitor (str = "loss",): Metric to monitor for reducing learning rate.
  • reduce_lr_min_delta (float = 0.01,): Minimum delta of metric to reduce learning rate.
  • reduce_lr_patience (int = 10,): Number of epochs to wait when the given minimum delta is not achieved, after which the learning rate is reduced.
  • reduce_lr_mode (str = "min",): Direction of the variation of the monitored metric for learning rate.
  • reduce_lr_factor (float = 0.9,): Factor for reduction of learning rate.
  • verbose (int = 1,): Whether to show the loading bar. Specifically, the options are:
    • 0 or False: No loading bar.
    • 1 or True: Showing only the loading bar for the epochs.
    • 2: Showing loading bar for both epochs and batches.
  • **kwargs (Dict,): Additional kwargs to pass to the Keras fit call.
Raises
  • ValueError,: If the given verbose value is not within the available set (-1, 0, 1).
Returns
  • Dataframe with training history.
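Continuing the construction sketch above, a fit sketch assuming sources, destinations and frequencies are NumPy arrays obtained from a precomputed co-occurrence matrix (the array values below are purely illustrative):

import numpy as np

sources = np.array([0, 0, 1, 2])
destinations = np.array([1, 2, 2, 3])
frequencies = np.array([10.0, 3.0, 7.0, 1.0])

history = glove.fit(
    (sources, destinations),
    frequencies,
    epochs=100,
    batch_size=2**20,
)
print(history.tail())  # training history as a pandas DataFrame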
Inherited Members
grape.embiggen.embedders.embedder.Embedder
TERMS_EMBEDDING_LAYER_NAME
trainable
summary
get_layer_weights
embedding
get_embedding_dataframe
save_embedding
name
save_weights
load_weights
#   class GraphCBOW(grape.embiggen.embedders.node2vec.Node2Vec):
class GraphCBOW(Node2Vec):
    """GraphCBOW model for graph embedding.

    The GraphCBOW model for graph embedding receives a list of contexts and tries
    to predict the central word. The model makes use of an NCE loss layer
    during the training process to generate the negatives.
    """

    def __init__(
        self,
        graph: Graph,
        embedding_size: int = 100,
        embedding: Union[np.ndarray, pd.DataFrame] = None,
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
        optimizer: Union[str, Optimizer] = None,
        negative_samples: int = 10,
        walk_length: int = 128,
        batch_size: int = 256,
        iterations: int = 16,
        window_size: int = 4,
        return_weight: float = 1.0,
        explore_weight: float = 1.0,
        change_node_type_weight: float = 1.0,
        change_edge_type_weight: float = 1.0,
        max_neighbours: int = None,
        elapsed_epochs: int = 0,
        support_mirrored_strategy: bool = False,
        random_state: int = 42,
        dense_node_mapping: Dict[int, int] = None,
        use_gradient_centralization: bool = True,
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        graph: Graph,
            Graph to be embedded.
        embedding_size: int = 100,
            Dimension of the embedding.
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        optimizer: Union[str, Optimizer] = None,
            The optimizer to be used during the training of the model.
            By default, if None is provided, Nadam with learning rate
            set at 0.01 is used.
        negative_samples: int = 10,
            The number of negative classes to randomly sample per batch.
            This single sample of negative classes is evaluated for each element in the batch.
        walk_length: int = 128,
            Maximal length of the walks.
        batch_size: int = 256,
            Number of nodes to include in a single batch.
        iterations: int = 16,
            Number of iterations of the single walks.
        window_size: int = 4,
            Window size for the local context.
            On the borders the window size is trimmed.
        return_weight: float = 1.0,
            Weight on the probability of returning to the same node the walk just came from.
            Having this higher makes the walks
            more like a Breadth-First Search.
            Having this very high (> 2) makes search very local.
            Equal to the inverse of p in the Node2Vec paper.
        explore_weight: float = 1.0,
            Weight on the probability of visiting a neighbor node
            to the one we're coming from in the random walk.
            Having this higher makes the walks
            more like a Depth-First Search.
            Having this very high makes search more outward.
            Having this very low makes search very local.
            Equal to the inverse of q in the Node2Vec paper.
        change_node_type_weight: float = 1.0,
            Weight on the probability of visiting a neighbor node of a
            different type than the previous node. This only applies to
            colored graphs, otherwise it has no impact.
        change_edge_type_weight: float = 1.0,
            Weight on the probability of visiting a neighbor edge of a
            different type than the previous edge. This only applies to
            multigraphs, otherwise it has no impact.
        max_neighbours: int = None,
            Number of maximum neighbours to consider when using approximated walks.
            By default, None, we execute exact random walks.
            This is mainly useful for graphs containing nodes with extremely high degrees.
        elapsed_epochs: int = 0,
            Number of elapsed epochs to init state of generator.
        support_mirrored_strategy: bool = False,
            Whether to patch support for MirroredStrategy.
            At the time of writing, TensorFlow's MirroredStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        random_state: int = 42,
            The random state to reproduce the training sequence.
        dense_node_mapping: Dict[int, int] = None,
            Mapping to use for converting sparse walk space into a dense space.
            This mapping can be created with the `get_dense_node_mapping`
            method available on the Graph object, which maps the non-trap
            nodes (those from which a walk can start) into a dense range
            of values.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        if not graph.has_nodes_sorted_by_decreasing_outbound_node_degree():
            raise ValueError(
                "The given graph does not have the nodes sorted by decreasing "
                "outbound node degree, therefore the NCE loss sampling (which follows a Zipfian "
                "distribution) would not approximate well the Softmax.\n"
                "In order to sort the given graph in such a way that the node IDs "
                "are sorted by decreasing outbound node degrees, you can use "
                "the Graph method "
                "`graph.sort_by_decreasing_outbound_node_degree()`"
            )
        super().__init__(
            graph=graph,
            word2vec_model=CBOW,
            embedding_size=embedding_size,
            embedding=embedding,
            extra_features=extra_features,
            optimizer=optimizer,
            negative_samples=negative_samples,
            walk_length=walk_length,
            batch_size=batch_size,
            iterations=iterations,
            window_size=window_size,
            return_weight=return_weight,
            explore_weight=explore_weight,
            change_node_type_weight=change_node_type_weight,
            change_edge_type_weight=change_edge_type_weight,
            max_neighbours=max_neighbours,
            elapsed_epochs=elapsed_epochs,
            support_mirrored_strategy=support_mirrored_strategy,
            random_state=random_state,
            dense_node_mapping=dense_node_mapping,
            use_gradient_centralization=use_gradient_centralization
        )

GraphCBOW model for graph embedding.

The GraphCBOW model for graph embedding receives a list of contexts and tries to predict the central word. The model makes use of an NCE loss layer during the training process to generate the negatives.

#   GraphCBOW( graph: Graph, embedding_size: int = 100, embedding: Union[numpy.ndarray, pandas.core.frame.DataFrame] = None, extra_features: Union[numpy.ndarray, pandas.core.frame.DataFrame] = None, optimizer: Union[str, keras.optimizer_v2.optimizer_v2.OptimizerV2] = None, negative_samples: int = 10, walk_length: int = 128, batch_size: int = 256, iterations: int = 16, window_size: int = 4, return_weight: float = 1.0, explore_weight: float = 1.0, change_node_type_weight: float = 1.0, change_edge_type_weight: float = 1.0, max_neighbours: int = None, elapsed_epochs: int = 0, support_mirrored_strategy: bool = False, random_state: int = 42, dense_node_mapping: Dict[int, int] = None, use_gradient_centralization: bool = True )
    def __init__(
        self,
        graph: Graph,
        embedding_size: int = 100,
        embedding: Union[np.ndarray, pd.DataFrame] = None,
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
        optimizer: Union[str, Optimizer] = None,
        negative_samples: int = 10,
        walk_length: int = 128,
        batch_size: int = 256,
        iterations: int = 16,
        window_size: int = 4,
        return_weight: float = 1.0,
        explore_weight: float = 1.0,
        change_node_type_weight: float = 1.0,
        change_edge_type_weight: float = 1.0,
        max_neighbours: int = None,
        elapsed_epochs: int = 0,
        support_mirrored_strategy: bool = False,
        random_state: int = 42,
        dense_node_mapping: Dict[int, int] = None,
        use_gradient_centralization: bool = True,
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        graph: Graph,
            Graph to be embedded.
        embedding_size: int = 100,
            Dimension of the embedding.
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        optimizer: Union[str, Optimizer] = None,
            The optimizer to be used during the training of the model.
            By default, if None is provided, Nadam with learning rate
            set at 0.01 is used.
        negative_samples: int = 10,
            The number of negative classes to randomly sample per batch.
            This single sample of negative classes is evaluated for each element in the batch.
        walk_length: int = 128,
            Maximal length of the walks.
        batch_size: int = 256,
            Number of nodes to include in a single batch.
        iterations: int = 16,
            Number of iterations of the single walks.
        window_size: int = 4,
            Window size for the local context.
            On the borders the window size is trimmed.
        return_weight: float = 1.0,
            Weight on the probability of returning to the same node the walk just came from.
            Having this higher makes the walks
            more like a Breadth-First Search.
            Having this very high (> 2) makes search very local.
            Equal to the inverse of p in the Node2Vec paper.
        explore_weight: float = 1.0,
            Weight on the probability of visiting a neighbor node
            to the one we're coming from in the random walk.
            Having this higher makes the walks
            more like a Depth-First Search.
            Having this very high makes search more outward.
            Having this very low makes search very local.
            Equal to the inverse of q in the Node2Vec paper.
        change_node_type_weight: float = 1.0,
            Weight on the probability of visiting a neighbor node of a
            different type than the previous node. This only applies to
            colored graphs, otherwise it has no impact.
        change_edge_type_weight: float = 1.0,
            Weight on the probability of visiting a neighbor edge of a
            different type than the previous edge. This only applies to
            multigraphs, otherwise it has no impact.
        max_neighbours: int = None,
            Number of maximum neighbours to consider when using approximated walks.
            By default, None, we execute exact random walks.
            This is mainly useful for graphs containing nodes with extremely high degrees.
        elapsed_epochs: int = 0,
            Number of elapsed epochs to init state of generator.
        support_mirrored_strategy: bool = False,
            Whether to patch support for MirroredStrategy.
            At the time of writing, TensorFlow's MirroredStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        random_state: int = 42,
            The random state to reproduce the training sequence.
        dense_node_mapping: Dict[int, int] = None,
            Mapping to use for converting sparse walk space into a dense space.
            This mapping can be created with the `get_dense_node_mapping`
            method available on the Graph object, which maps the non-trap
            nodes (those from which a walk can start) into a dense range
            of values.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        if not graph.has_nodes_sorted_by_decreasing_outbound_node_degree():
            raise ValueError(
                "The given graph does not have the nodes sorted by decreasing "
                "outbound node degree, therefore the NCE loss sampling (which follows a Zipfian "
                "distribution) would not approximate well the Softmax.\n"
                "In order to sort the given graph in such a way that the node IDs "
                "are sorted by decreasing outbound node degrees, you can use "
                "the Graph method "
                "`graph.sort_by_decreasing_outbound_node_degree()`"
            )
        super().__init__(
            graph=graph,
            word2vec_model=CBOW,
            embedding_size=embedding_size,
            embedding=embedding,
            extra_features=extra_features,
            optimizer=optimizer,
            negative_samples=negative_samples,
            walk_length=walk_length,
            batch_size=batch_size,
            iterations=iterations,
            window_size=window_size,
            return_weight=return_weight,
            explore_weight=explore_weight,
            change_node_type_weight=change_node_type_weight,
            change_edge_type_weight=change_edge_type_weight,
            max_neighbours=max_neighbours,
            elapsed_epochs=elapsed_epochs,
            support_mirrored_strategy=support_mirrored_strategy,
            random_state=random_state,
            dense_node_mapping=dense_node_mapping,
            use_gradient_centralization=use_gradient_centralization
        )

Create new sequence Embedder model.

Parameters
  • graph (Graph,): Graph to be embedded.
  • embedding_size (int = 100,): Dimension of the embedding.
  • embedding (Union[np.ndarray, pd.DataFrame] = None,): The seed embedding to be used. Note that it is not possible to provide at once both the embedding and either the vocabulary size or the embedding size.
  • extra_features (Union[np.ndarray, pd.DataFrame] = None,): Optional extra features to be used during the computation of the embedding. The features must be available for all the elements considered for the embedding.
  • optimizer (Union[str, Optimizer] = None,): The optimizer to be used during the training of the model. By default, if None is provided, Nadam with learning rate set at 0.01 is used.
  • negative_samples (int = 10,): The number of negative classes to randomly sample per batch. This single sample of negative classes is evaluated for each element in the batch.
  • walk_length (int = 128,): Maximal length of the walks.
  • batch_size (int = 256,): Number of nodes to include in a single batch.
  • iterations (int = 16,): Number of iterations of the single walks.
  • window_size (int = 4,): Window size for the local context. On the borders the window size is trimmed.
  • return_weight (float = 1.0,): Weight on the probability of returning to the same node the walk just came from. Having this higher makes the walks more like a Breadth-First Search. Having this very high (> 2) makes search very local. Equal to the inverse of p in the Node2Vec paper.
  • explore_weight (float = 1.0,): Weight on the probability of visiting a neighbor node to the one we're coming from in the random walk. Having this higher makes the walks more like a Depth-First Search. Having this very high makes search more outward. Having this very low makes search very local. Equal to the inverse of q in the Node2Vec paper.
  • change_node_type_weight (float = 1.0,): Weight on the probability of visiting a neighbor node of a different type than the previous node. This only applies to colored graphs, otherwise it has no impact.
  • change_edge_type_weight (float = 1.0,): Weight on the probability of visiting a neighbor edge of a different type than the previous edge. This only applies to multigraphs, otherwise it has no impact.
  • max_neighbours (int = None,): Number of maximum neighbours to consider when using approximated walks. By default, None, we execute exact random walks. This is mainly useful for graphs containing nodes with extremely high degrees.
  • elapsed_epochs (int = 0,): Number of elapsed epochs to init state of generator.
  • support_mirrored_strategy (bool = False,): Whether to patch support for MirroredStrategy. At the time of writing, TensorFlow's MirroredStrategy does not support input values different from floats, therefore to support it we need to convert the unsigned int 32 values that represent the indices of the embedding layers we receive from Ensmallen to floats. This will generally slow down performance, but in the context of exploiting multiple GPUs it may be unnoticeable.
  • random_state (int = 42,): The random state to reproduce the training sequence.
  • dense_node_mapping (Dict[int, int] = None,): Mapping to use for converting sparse walk space into a dense space. This mapping can be created with the get_dense_node_mapping method available on the Graph object, which maps the non-trap nodes (those from which a walk can start) into a dense range of values.
  • use_gradient_centralization (bool = True,): Whether to wrap the provided optimizer into a normalized one that centralizes the gradient. It is automatically enabled if the current version of TensorFlow supports gradient transformers. More detail here: https://arxiv.org/pdf/2004.01461.pdf
Inherited Members
grape.embiggen.embedders.node2vec.Node2Vec
fit
summary
embedding
trainable
get_embedding_dataframe
save_embedding
name
save_weights
load_weights
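An end-to-end sketch for GraphCBOW; graph is assumed to be a Graph loaded elsewhere, and whether the sorting method returns a new graph or sorts in place depends on the Graph API, so the reassignment below is an assumption.

# GraphCBOW requires node IDs sorted by decreasing outbound node degree so
# that the Zipfian negative sampling approximates the softmax well.
if not graph.has_nodes_sorted_by_decreasing_outbound_node_degree():
    graph = graph.sort_by_decreasing_outbound_node_degree()

model = GraphCBOW(
    graph,
    embedding_size=100,
    walk_length=128,
    iterations=16,
    window_size=4,
)
history = model.fit()  # inherited from Node2Vec; keyword arguments omitted here
# The trained embedding is then available through the inherited embedding
# property and the get_embedding_dataframe / save_embedding helpers.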
#   class GraphSkipGram(grape.embiggen.embedders.node2vec.Node2Vec):
class GraphSkipGram(Node2Vec):
    """GraphSkipGram model for graph embedding.

    The SkipGram model for graph embedding receives a central word and tries
    to predict its contexts. The model makes use of an NCE loss layer
    during the training process to generate the negatives.
    """

    def __init__(
        self,
        graph: Graph,
        embedding_size: int = 100,
        embedding: Union[np.ndarray, pd.DataFrame] = None,
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
        optimizer: Union[str, Optimizer] = None,
        negative_samples: int = 10,
        walk_length: int = 128,
        batch_size: int = 256,
        iterations: int = 16,
        window_size: int = 3,
        return_weight: float = 1.0,
        explore_weight: float = 1.0,
        change_node_type_weight: float = 1.0,
        change_edge_type_weight: float = 1.0,
        max_neighbours: int = None,
        elapsed_epochs: int = 0,
        support_mirrored_strategy: bool = False,
        random_state: int = 42,
        dense_node_mapping: Dict[int, int] = None,
        use_gradient_centralization: bool = True,
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        graph: Graph,
            Graph to be embedded.
        embedding_size: int = 100,
            Dimension of the embedding.
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        optimizer: Union[str, Optimizer] = None,
            The optimizer to be used during the training of the model.
            By default, if None is provided, Nadam with learning rate
            set at 0.01 is used.
        negative_samples: int = 10,
            The number of negative classes to randomly sample per batch.
            This single sample of negative classes is evaluated for each element in the batch.
        walk_length: int = 128,
            Maximal length of the walks.
        batch_size: int = 256,
            Number of nodes to include in a single batch.
        iterations: int = 16,
            Number of iterations of the single walks.
        window_size: int = 4,
            Window size for the local context.
            On the borders the window size is trimmed.
        return_weight: float = 1.0,
            Weight on the probability of returning to the same node the walk
            just came from. Higher values make the walks behave more like a
            Breadth-First Search. Very high values (> 2) make the search very local.
            Equal to the inverse of p in the Node2Vec paper.
        explore_weight: float = 1.0,
            Weight on the probability of visiting a neighbor node other than
            the one the walk is coming from. Higher values make the walks behave
            more like a Depth-First Search. Very high values make the search more
            outward, while very low values make the search very local.
            Equal to the inverse of q in the Node2Vec paper.
        change_node_type_weight: float = 1.0,
            Weight on the probability of visiting a neighbor node of a
            different type than the previous node. This only applies to
            colored graphs, otherwise it has no impact.
        change_edge_type_weight: float = 1.0,
            Weight on the probability of visiting a neighbor edge of a
            different type than the previous edge. This only applies to
            multigraphs, otherwise it has no impact.
        max_neighbours: int = None,
            Number of maximum neighbours to consider when using approximated walks.
            By default, None, we execute exact random walks.
            This is mainly useful for graphs containing nodes with extremely high degrees.
        elapsed_epochs: int = 0,
            Number of elapsed epochs to init state of generator.
        support_mirrored_strategy: bool = False,
            Whether to patch support for mirror strategy.
            At the time of writing, TensorFlow's MirrorStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        random_state: int = 42,
            The random state to reproduce the training sequence.
        dense_node_mapping: Dict[int, int] = None,
            Mapping to use for converting sparse walk space into a dense space.
            This object can be created using the Graph method
            `get_dense_node_mapping`, which maps the non-trap nodes
            (those from which a walk can start) onto a dense range of values.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        if not graph.has_nodes_sorted_by_decreasing_outbound_node_degree():
            raise ValueError(
                "The given graph does not have the nodes sorted by decreasing "
                "order, therefore the NCE loss sampling (which follows a zipfian "
                "distribution) would not approximate well the Softmax.\n"
                "In order to sort the given graph in such a way that the node IDs "
                "are sorted by decreasing outbound node degrees, you can use "
                "the Graph method "
                "`graph.sort_by_decreasing_outbound_node_degree()`"
            )

        super().__init__(
            graph=graph,
            word2vec_model=SkipGram,
            embedding_size=embedding_size,
            embedding=embedding,
            extra_features=extra_features,
            optimizer=optimizer,
            negative_samples=negative_samples,
            walk_length=walk_length,
            batch_size=batch_size,
            iterations=iterations,
            window_size=window_size,
            return_weight=return_weight,
            explore_weight=explore_weight,
            change_node_type_weight=change_node_type_weight,
            change_edge_type_weight=change_edge_type_weight,
            max_neighbours=max_neighbours,
            elapsed_epochs=elapsed_epochs,
            support_mirrored_strategy=support_mirrored_strategy,
            random_state=random_state,
            dense_node_mapping=dense_node_mapping,
            use_gradient_centralization=use_gradient_centralization
        )

GraphSkipGram model for graph embedding.

The SkipGram model for graph embedding receives a central word and tries to predict its contexts. The model makes use of an NCE loss layer during the training process to generate the negatives.

#   GraphSkipGram( graph: Graph, embedding_size: int = 100, embedding: Union[numpy.ndarray, pandas.core.frame.DataFrame] = None, extra_features: Union[numpy.ndarray, pandas.core.frame.DataFrame] = None, optimizer: Union[str, keras.optimizer_v2.optimizer_v2.OptimizerV2] = None, negative_samples: int = 10, walk_length: int = 128, batch_size: int = 256, iterations: int = 16, window_size: int = 3, return_weight: float = 1.0, explore_weight: float = 1.0, change_node_type_weight: float = 1.0, change_edge_type_weight: float = 1.0, max_neighbours: int = None, elapsed_epochs: int = 0, support_mirrored_strategy: bool = False, random_state: int = 42, dense_node_mapping: Dict[int, int] = None, use_gradient_centralization: bool = True )
View Source
    def __init__(
        self,
        graph: Graph,
        embedding_size: int = 100,
        embedding: Union[np.ndarray, pd.DataFrame] = None,
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
        optimizer: Union[str, Optimizer] = None,
        negative_samples: int = 10,
        walk_length: int = 128,
        batch_size: int = 256,
        iterations: int = 16,
        window_size: int = 3,
        return_weight: float = 1.0,
        explore_weight: float = 1.0,
        change_node_type_weight: float = 1.0,
        change_edge_type_weight: float = 1.0,
        max_neighbours: int = None,
        elapsed_epochs: int = 0,
        support_mirrored_strategy: bool = False,
        random_state: int = 42,
        dense_node_mapping: Dict[int, int] = None,
        use_gradient_centralization: bool = True,
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        graph: Graph,
            Graph to be embedded.
        embedding_size: int = 100,
            Dimension of the embedding.
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        optimizer: Union[str, Optimizer] = None,
            The optimizer to be used during the training of the model.
            By default, if None is provided, Nadam with learning rate
            set at 0.01 is used.
        negative_samples: int = 10,
            The number of negative classes to randomly sample per batch.
            This single sample of negative classes is evaluated for each element in the batch.
        walk_length: int = 128,
            Maximal length of the walks.
        batch_size: int = 256,
            Number of nodes to include in a single batch.
        iterations: int = 16,
            Number of iterations of the single walks.
        window_size: int = 3,
            Window size for the local context.
            On the borders the window size is trimmed.
        return_weight: float = 1.0,
            Weight on the probability of returning to the same node the walk
            just came from. Higher values make the walks behave more like a
            Breadth-First Search. Very high values (> 2) make the search very local.
            Equal to the inverse of p in the Node2Vec paper.
        explore_weight: float = 1.0,
            Weight on the probability of visiting a neighbor node other than
            the one the walk is coming from. Higher values make the walks behave
            more like a Depth-First Search. Very high values make the search more
            outward, while very low values make the search very local.
            Equal to the inverse of q in the Node2Vec paper.
        change_node_type_weight: float = 1.0,
            Weight on the probability of visiting a neighbor node of a
            different type than the previous node. This only applies to
            colored graphs, otherwise it has no impact.
        change_edge_type_weight: float = 1.0,
            Weight on the probability of visiting a neighbor edge of a
            different type than the previous edge. This only applies to
            multigraphs, otherwise it has no impact.
        max_neighbours: int = None,
            Number of maximum neighbours to consider when using approximated walks.
            By default, None, we execute exact random walks.
            This is mainly useful for graphs containing nodes with extremely high degrees.
        elapsed_epochs: int = 0,
            Number of elapsed epochs to init state of generator.
        support_mirrored_strategy: bool = False,
            Whether to patch support for mirror strategy.
            At the time of writing, TensorFlow's MirrorStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        random_state: int = 42,
            The random state to reproduce the training sequence.
        dense_node_mapping: Dict[int, int] = None,
            Mapping to use for converting sparse walk space into a dense space.
            This object can be created using the Graph method
            `get_dense_node_mapping`, which maps the non-trap nodes
            (those from which a walk can start) onto a dense range of values.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        if not graph.has_nodes_sorted_by_decreasing_outbound_node_degree():
            raise ValueError(
                "The given graph does not have the nodes sorted by decreasing "
                "order, therefore the NCE loss sampling (which follows a zipfian "
                "distribution) would not approximate well the Softmax.\n"
                "In order to sort the given graph in such a way that the node IDs "
                "are sorted by decreasing outbound node degrees, you can use "
                "the Graph method "
                "`graph.sort_by_decreasing_outbound_node_degree()`"
            )

        super().__init__(
            graph=graph,
            word2vec_model=SkipGram,
            embedding_size=embedding_size,
            embedding=embedding,
            extra_features=extra_features,
            optimizer=optimizer,
            negative_samples=negative_samples,
            walk_length=walk_length,
            batch_size=batch_size,
            iterations=iterations,
            window_size=window_size,
            return_weight=return_weight,
            explore_weight=explore_weight,
            change_node_type_weight=change_node_type_weight,
            change_edge_type_weight=change_edge_type_weight,
            max_neighbours=max_neighbours,
            elapsed_epochs=elapsed_epochs,
            support_mirrored_strategy=support_mirrored_strategy,
            random_state=random_state,
            dense_node_mapping=dense_node_mapping,
            use_gradient_centralization=use_gradient_centralization
        )

Create new sequence Embedder model.

Parameters
  • graph (Graph,): Graph to be embedded.
  • embedding_size (int = 100,): Dimension of the embedding.
  • embedding (Union[np.ndarray, pd.DataFrame] = None,): The seed embedding to be used. Note that it is not possible to provide at once both the embedding and either the vocabulary size or the embedding size.
  • extra_features (Union[np.ndarray, pd.DataFrame] = None,): Optional extra features to be used during the computation of the embedding. The features must be available for all the elements considered for the embedding.
  • optimizer (Union[str, Optimizer] = None,): The optimizer to be used during the training of the model. By default, if None is provided, Nadam with learning rate set at 0.01 is used.
  • negative_samples (int = 10,): The number of negative classes to randomly sample per batch. This single sample of negative classes is evaluated for each element in the batch.
  • walk_length (int = 128,): Maximal length of the walks.
  • batch_size (int = 256,): Number of nodes to include in a single batch.
  • iterations (int = 16,): Number of iterations of the single walks.
  • window_size (int = 3,): Window size for the local context. On the borders the window size is trimmed.
  • return_weight (float = 1.0,): Weight on the probability of returning to the same node the walk just came from. Higher values make the walks behave more like a Breadth-First Search. Very high values (> 2) make the search very local. Equal to the inverse of p in the Node2Vec paper.
  • explore_weight (float = 1.0,): Weight on the probability of visiting a neighbor node other than the one the walk is coming from. Higher values make the walks behave more like a Depth-First Search. Very high values make the search more outward, while very low values make the search very local. Equal to the inverse of q in the Node2Vec paper.
  • change_node_type_weight (float = 1.0,): Weight on the probability of visiting a neighbor node of a different type than the previous node. This only applies to colored graphs, otherwise it has no impact.
  • change_edge_type_weight (float = 1.0,): Weight on the probability of visiting a neighbor edge of a different type than the previous edge. This only applies to multigraphs, otherwise it has no impact.
  • max_neighbours (int = None,): Number of maximum neighbours to consider when using approximated walks. By default, None, we execute exact random walks. This is mainly useful for graphs containing nodes with extremely high degrees.
  • elapsed_epochs (int = 0,): Number of elapsed epochs to init state of generator.
  • support_mirrored_strategy (bool = False,): Whether to patch support for mirror strategy. At the time of writing, TensorFlow's MirrorStrategy does not support input values different from floats, therefore to support it we need to convert the unsigned int 32 values that represent the indices of the embedding layers we receive from Ensmallen to floats. This will generally slow down performance, but in the context of exploiting multiple GPUs it may be unnoticeable.
  • random_state (int = 42,): The random state to reproduce the training sequence.
  • dense_node_mapping (Dict[int, int] = None,): Mapping to use for converting sparse walk space into a dense space. This object can be created using the Graph method get_dense_node_mapping, which maps the non-trap nodes (those from which a walk can start) onto a dense range of values.
  • use_gradient_centralization (bool = True,): Whether to wrap the provided optimizer into a normalized one that centralizes the gradient. It is automatically enabled if the current version of TensorFlow supports gradient transformers. More detail here: https://arxiv.org/pdf/2004.01461.pdf
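As a usage sketch only: the constructor requires the node IDs to be sorted by decreasing outbound degree, so a typical flow first checks and sorts the graph, then fits the model. The graph loading step, the no-argument fit call, and its return value are assumptions here, not guarantees of this API.

from grape.embiggen import GraphSkipGram

# Assumption: `graph` is an existing Ensmallen Graph instance; how it is
# loaded is outside the scope of this sketch.
if not graph.has_nodes_sorted_by_decreasing_outbound_node_degree():
    # The constructor raises a ValueError otherwise; this is the sorting
    # method suggested in that error message. Whether it sorts in place or
    # returns a new graph may depend on the library version.
    graph = graph.sort_by_decreasing_outbound_node_degree()

model = GraphSkipGram(
    graph,
    embedding_size=100,
    walk_length=128,
    iterations=16,
    window_size=3,
)
history = model.fit()                         # assumed to return the training history
embedding = model.get_embedding_dataframe()   # one row per node, indexed by node name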
Inherited Members
grape.embiggen.embedders.node2vec.Node2Vec
fit
summary
embedding
trainable
get_embedding_dataframe
save_embedding
name
save_weights
load_weights
#   class GraphGloVe(grape.embiggen.GloVe):
View Source
class GraphGloVe(GloVe):
    """GloVe model for graph and words embedding.

    The GloVe model for graph embedding receives two words and is asked to
    predict their cooccurrence probability.
    """

    def __init__(
        self,
        graph: Graph,
        embedding_size: int = 100,
        embedding: Union[np.ndarray, pd.DataFrame] = None,
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
        optimizer: Union[str, Optimizer] = None,
        alpha: float = 0.75,
        directed: bool = False,
        walk_length: int = 128,
        iterations: int = 16,
        window_size: int = 4,
        return_weight: float = 1.0,
        explore_weight: float = 1.0,
        change_node_type_weight: float = 1.0,
        change_edge_type_weight: float = 1.0,
        max_neighbours: int = None,
        support_mirrored_strategy: bool = False,
        random_state: int = 42,
        dense_node_mapping: Dict[int, int] = None,
        use_gradient_centralization: bool = True,
    ):
        """Create new GloVe-based Embedder object.

        Parameters
        ----------------------------
        graph: Graph,
            Graph to be embedded.
        embedding_size: int,
            Dimension of the embedding.
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        optimizer: Union[str, Optimizer] = None,
            The optimizer to be used during the training of the model.
            By default, if None is provided, Nadam with learning rate
            set at 0.01 is used.
        alpha: float = 0.75,
            Alpha to use for the function.
        directed: bool = False,
            Whether to treat the data as directed or not.
        walk_length: int = 128,
            Maximal length of the walks.
        iterations: int = 16,
            Number of iterations of the single walks.
        window_size: int = 4,
            Window size for the local context.
            On the borders the window size is trimmed.
        return_weight: float = 1.0,
            Weight on the probability of returning to the same node the walk
            just came from. Higher values make the walks behave more like a
            Breadth-First Search. Very high values (> 2) make the search very local.
            Equal to the inverse of p in the Node2Vec paper.
        explore_weight: float = 1.0,
            Weight on the probability of visiting a neighbor node other than
            the one the walk is coming from. Higher values make the walks behave
            more like a Depth-First Search. Very high values make the search more
            outward, while very low values make the search very local.
            Equal to the inverse of q in the Node2Vec paper.
        change_node_type_weight: float = 1.0,
            Weight on the probability of visiting a neighbor node of a
            different type than the previous node. This only applies to
            colored graphs, otherwise it has no impact.
        change_edge_type_weight: float = 1.0,
            Weight on the probability of visiting a neighbor edge of a
            different type than the previous edge. This only applies to
            multigraphs, otherwise it has no impact.
        max_neighbours: int = None,
            Number of maximum neighbours to consider when using approximated walks.
            By default, None, we execute exact random walks.
            This is mainly useful for graphs containing nodes with extremely high degrees.
        support_mirrored_strategy: bool = False,
            Whether to patch support for mirror strategy.
            At the time of writing, TensorFlow's MirrorStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        random_state: int = 42,
            The random state to reproduce the training sequence.
        dense_node_mapping: Dict[int, int] = None,
            Mapping to use for converting sparse walk space into a dense space.
            This object can be created using the Graph method
            `get_dense_node_mapping`, which maps the non-trap nodes
            (those from which a walk can start) onto a dense range of values.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        self._graph = graph
        self._walk_length = walk_length
        self._iterations = iterations
        self._window_size = validate_window_size(window_size)
        self._return_weight = return_weight
        self._explore_weight = explore_weight
        self._change_node_type_weight = change_node_type_weight
        self._change_edge_type_weight = change_edge_type_weight
        self._max_neighbours = max_neighbours
        self._support_mirrored_strategy = support_mirrored_strategy
        self._random_state = random_state
        self._dense_node_mapping = dense_node_mapping
        super().__init__(
            alpha=alpha,
            random_state=random_state,
            vocabulary_size=self._graph.get_number_of_nodes(),
            embedding_size=embedding_size,
            embedding=embedding,
            extra_features=extra_features,
            optimizer=optimizer,
            use_gradient_centralization=use_gradient_centralization
        )

    def get_embedding_dataframe(self) -> pd.DataFrame:
        """Return terms embedding using given index names."""
        return super().get_embedding_dataframe(self._graph.get_node_names())

    def fit(
        self,
        epochs: int = 1000,
        batch_size: int = 2**20,
        early_stopping_monitor: str = "loss",
        early_stopping_min_delta: float = 0.0001,
        early_stopping_patience: int = 10,
        early_stopping_mode: str = "min",
        reduce_lr_monitor: str = "loss",
        reduce_lr_min_delta: float = 0.0001,
        reduce_lr_patience: int = 5,
        reduce_lr_mode: str = "min",
        reduce_lr_factor: float = 0.9,
        verbose: int = 2,
        **kwargs: Dict
    ) -> pd.DataFrame:
        """Return pandas dataframe with training history.

        Parameters
        -----------------------
        epochs: int = 1000,
            Epochs to train the model for.
        batch_size: int = 2**20,
            The batch size.
            Typically, batch sizes for the GloVe model can be immense.
        early_stopping_monitor: str = "loss",
            Metric to monitor for early stopping.
        early_stopping_min_delta: float = 0.0001,
            Minimum delta of metric to stop the training.
        early_stopping_patience: int = 10,
            Number of epochs to wait, when the given minimum delta is not
            achieved, before triggering early stopping.
        early_stopping_mode: str = "min",
            Direction of the variation of the monitored metric for early stopping.
        reduce_lr_monitor: str = "loss",
            Metric to monitor for reducing learning rate.
        reduce_lr_min_delta: float = 0.0001,
            Minimum delta of metric to reduce learning rate.
        reduce_lr_patience: int = 5,
            Number of epochs to wait, when the given minimum delta is not
            achieved, before reducing the learning rate.
        reduce_lr_mode: str = "min",
            Direction of the variation of the monitored metric for learning rate.
        reduce_lr_factor: float = 0.9,
            Factor for reduction of learning rate.
        verbose: int = 2,
            Whether to show the loading bar.
            Specifically, the options are:
            * 0 or False: No loading bar.
            * 1 or True: Showing only the loading bar for the epochs.
            * 2: Showing loading bar for both epochs and batches.
        **kwargs: Dict,
            Additional kwargs to pass to the Keras fit call.

        Raises
        -----------------------
        ValueError,
            If given verbose value is not within the available set (0, 1, 2).

        Returns
        -----------------------
        Dataframe with training history.
        """
        sources, destinations, frequencies = self._graph.cooccurence_matrix(
            walk_length=self._walk_length,
            window_size=self._window_size,
            iterations=self._iterations,
            return_weight=self._return_weight,
            explore_weight=self._explore_weight,
            change_edge_type_weight=self._change_edge_type_weight,
            change_node_type_weight=self._change_node_type_weight,
            dense_node_mapping=self._dense_node_mapping,
            max_neighbours=self._max_neighbours,
            random_state=self._random_state,
            verbose=verbose > 0
        )
        if self._support_mirrored_strategy:
            sources = sources.astype(float)
            destinations = destinations.astype(float)
        return super().fit(
            (sources, destinations), frequencies,
            epochs=epochs,
            batch_size=batch_size,
            early_stopping_monitor=early_stopping_monitor,
            early_stopping_min_delta=early_stopping_min_delta,
            early_stopping_patience=early_stopping_patience,
            early_stopping_mode=early_stopping_mode,
            reduce_lr_monitor=reduce_lr_monitor,
            reduce_lr_min_delta=reduce_lr_min_delta,
            reduce_lr_patience=reduce_lr_patience,
            reduce_lr_mode=reduce_lr_mode,
            reduce_lr_factor=reduce_lr_factor,
            verbose=verbose,
            **kwargs
        )

GloVe model for graph and words embedding.

The GloVe model for graph embedding receives two words and is asked to predict their cooccurrence probability.

#   GraphGloVe( graph: Graph, embedding_size: int = 100, embedding: Union[numpy.ndarray, pandas.core.frame.DataFrame] = None, extra_features: Union[numpy.ndarray, pandas.core.frame.DataFrame] = None, optimizer: Union[str, keras.optimizer_v2.optimizer_v2.OptimizerV2] = None, alpha: float = 0.75, directed: bool = False, walk_length: int = 128, iterations: int = 16, window_size: int = 4, return_weight: float = 1.0, explore_weight: float = 1.0, change_node_type_weight: float = 1.0, change_edge_type_weight: float = 1.0, max_neighbours: int = None, support_mirrored_strategy: bool = False, random_state: int = 42, dense_node_mapping: Dict[int, int] = None, use_gradient_centralization: bool = True )
View Source
    def __init__(
        self,
        graph: Graph,
        embedding_size: int = 100,
        embedding: Union[np.ndarray, pd.DataFrame] = None,
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
        optimizer: Union[str, Optimizer] = None,
        alpha: float = 0.75,
        directed: bool = False,
        walk_length: int = 128,
        iterations: int = 16,
        window_size: int = 4,
        return_weight: float = 1.0,
        explore_weight: float = 1.0,
        change_node_type_weight: float = 1.0,
        change_edge_type_weight: float = 1.0,
        max_neighbours: int = None,
        support_mirrored_strategy: bool = False,
        random_state: int = 42,
        dense_node_mapping: Dict[int, int] = None,
        use_gradient_centralization: bool = True,
    ):
        """Create new GloVe-based Embedder object.

        Parameters
        ----------------------------
        graph: Graph,
            Graph to be embedded.
        embedding_size: int,
            Dimension of the embedding.
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        optimizer: Union[str, Optimizer] = None,
            The optimizer to be used during the training of the model.
            By default, if None is provided, Nadam with learning rate
            set at 0.01 is used.
        alpha: float = 0.75,
            Alpha to use for the function.
        directed: bool = False,
            Whether to treat the data as directed or not.
        walk_length: int = 128,
            Maximal length of the walks.
        iterations: int = 16,
            Number of iterations of the single walks.
        window_size: int = 4,
            Window size for the local context.
            On the borders the window size is trimmed.
        return_weight: float = 1.0,
            Weight on the probability of returning to the same node the walk
            just came from. Higher values make the walks behave more like a
            Breadth-First Search. Very high values (> 2) make the search very local.
            Equal to the inverse of p in the Node2Vec paper.
        explore_weight: float = 1.0,
            Weight on the probability of visiting a neighbor node other than
            the one the walk is coming from. Higher values make the walks behave
            more like a Depth-First Search. Very high values make the search more
            outward, while very low values make the search very local.
            Equal to the inverse of q in the Node2Vec paper.
        change_node_type_weight: float = 1.0,
            Weight on the probability of visiting a neighbor node of a
            different type than the previous node. This only applies to
            colored graphs, otherwise it has no impact.
        change_edge_type_weight: float = 1.0,
            Weight on the probability of visiting a neighbor edge of a
            different type than the previous edge. This only applies to
            multigraphs, otherwise it has no impact.
        max_neighbours: int = None,
            Number of maximum neighbours to consider when using approximated walks.
            By default, None, we execute exact random walks.
            This is mainly useful for graphs containing nodes with extremely high degrees.
        support_mirrored_strategy: bool = False,
            Whether to patch support for mirror strategy.
            At the time of writing, TensorFlow's MirrorStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        random_state: int = 42,
            The random state to reproduce the training sequence.
        dense_node_mapping: Dict[int, int] = None,
            Mapping to use for converting sparse walk space into a dense space.
            This object can be created using the Graph method
            `get_dense_node_mapping`, which maps the non-trap nodes
            (those from which a walk can start) onto a dense range of values.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        self._graph = graph
        self._walk_length = walk_length
        self._iterations = iterations
        self._window_size = validate_window_size(window_size)
        self._return_weight = return_weight
        self._explore_weight = explore_weight
        self._change_node_type_weight = change_node_type_weight
        self._change_edge_type_weight = change_edge_type_weight
        self._max_neighbours = max_neighbours
        self._support_mirrored_strategy = support_mirrored_strategy
        self._random_state = random_state
        self._dense_node_mapping = dense_node_mapping
        super().__init__(
            alpha=alpha,
            random_state=random_state,
            vocabulary_size=self._graph.get_number_of_nodes(),
            embedding_size=embedding_size,
            embedding=embedding,
            extra_features=extra_features,
            optimizer=optimizer,
            use_gradient_centralization=use_gradient_centralization
        )

Create new GloVe-based Embedder object.

Parameters
  • graph (Graph,): Graph to be embedded.
  • embedding_size (int,): Dimension of the embedding.
  • embedding (Union[np.ndarray, pd.DataFrame] = None,): The seed embedding to be used. Note that it is not possible to provide at once both the embedding and either the vocabulary size or the embedding size.
  • extra_features (Union[np.ndarray, pd.DataFrame] = None,): Optional extra features to be used during the computation of the embedding. The features must be available for all the elements considered for the embedding.
  • optimizer (Union[str, Optimizer] = None,): The optimizer to be used during the training of the model. By default, if None is provided, Nadam with learning rate set at 0.01 is used.
  • alpha (float = 0.75,): Alpha to use for the function.
  • directed (bool = False,): Whether to treat the data as directed or not.
  • walk_length (int = 128,): Maximal length of the walks.
  • iterations (int = 16,): Number of iterations of the single walks.
  • window_size (int = 4,): Window size for the local context. On the borders the window size is trimmed.
  • return_weight (float = 1.0,): Weight on the probability of returning to the same node the walk just came from. Higher values make the walks behave more like a Breadth-First Search. Very high values (> 2) make the search very local. Equal to the inverse of p in the Node2Vec paper.
  • explore_weight (float = 1.0,): Weight on the probability of visiting a neighbor node other than the one the walk is coming from. Higher values make the walks behave more like a Depth-First Search. Very high values make the search more outward, while very low values make the search very local. Equal to the inverse of q in the Node2Vec paper.
  • change_node_type_weight (float = 1.0,): Weight on the probability of visiting a neighbor node of a different type than the previous node. This only applies to colored graphs, otherwise it has no impact.
  • change_edge_type_weight (float = 1.0,): Weight on the probability of visiting a neighbor edge of a different type than the previous edge. This only applies to multigraphs, otherwise it has no impact.
  • max_neighbours (int = None,): Number of maximum neighbours to consider when using approximated walks. By default, None, we execute exact random walks. This is mainly useful for graphs containing nodes with extremely high degrees.
  • support_mirrored_strategy (bool = False,): Whether to patch support for mirror strategy. At the time of writing, TensorFlow's MirrorStrategy does not support input values different from floats, therefore to support it we need to convert the unsigned int 32 values that represent the indices of the embedding layers we receive from Ensmallen to floats. This will generally slow down performance, but in the context of exploiting multiple GPUs it may be unnoticeable.
  • random_state (int = 42,): The random state to reproduce the training sequence.
  • dense_node_mapping (Dict[int, int] = None,): Mapping to use for converting sparse walk space into a dense space. This object can be created using the Graph method get_dense_node_mapping, which maps the non-trap nodes (those from which a walk can start) onto a dense range of values.
  • use_gradient_centralization (bool = True,): Whether to wrap the provided optimizer into a normalized one that centralizes the gradient. It is automatically enabled if the current version of TensorFlow supports gradient transformers. More detail here: https://arxiv.org/pdf/2004.01461.pdf
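A minimal construction sketch, assuming `graph` is an already loaded Graph instance (loading it is not shown here; the keyword values simply restate the documented defaults):

from grape.embiggen import GraphGloVe

model = GraphGloVe(
    graph,                 # assumed to be an existing Ensmallen Graph instance
    embedding_size=100,
    alpha=0.75,
    walk_length=128,
    iterations=16,
    window_size=4,
)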
#   def get_embedding_dataframe(self) -> pandas.core.frame.DataFrame:
View Source
    def get_embedding_dataframe(self) -> pd.DataFrame:
        """Return terms embedding using given index names."""
        return super().get_embedding_dataframe(self._graph.get_node_names())

Return terms embedding using given index names.

#   def fit( self, epochs: int = 1000, batch_size: int = 1048576, early_stopping_monitor: str = 'loss', early_stopping_min_delta: float = 0.0001, early_stopping_patience: int = 10, early_stopping_mode: str = 'min', reduce_lr_monitor: str = 'loss', reduce_lr_min_delta: float = 0.0001, reduce_lr_patience: int = 5, reduce_lr_mode: str = 'min', reduce_lr_factor: float = 0.9, verbose: int = 2, **kwargs: Dict ) -> pandas.core.frame.DataFrame:
View Source
    def fit(
        self,
        epochs: int = 1000,
        batch_size: int = 2**20,
        early_stopping_monitor: str = "loss",
        early_stopping_min_delta: float = 0.0001,
        early_stopping_patience: int = 10,
        early_stopping_mode: str = "min",
        reduce_lr_monitor: str = "loss",
        reduce_lr_min_delta: float = 0.0001,
        reduce_lr_patience: int = 5,
        reduce_lr_mode: str = "min",
        reduce_lr_factor: float = 0.9,
        verbose: int = 2,
        **kwargs: Dict
    ) -> pd.DataFrame:
        """Return pandas dataframe with training history.

        Parameters
        -----------------------
        epochs: int = 1000,
            Epochs to train the model for.
        batch_size: int = 2**20,
            The batch size.
            Typically, batch sizes for the GloVe model can be immense.
        early_stopping_monitor: str = "loss",
            Metric to monitor for early stopping.
        early_stopping_min_delta: float = 0.0001,
            Minimum delta of metric to stop the training.
        early_stopping_patience: int = 10,
            Number of epochs to wait, when the given minimum delta is not
            achieved, before triggering early stopping.
        early_stopping_mode: str = "min",
            Direction of the variation of the monitored metric for early stopping.
        reduce_lr_monitor: str = "loss",
            Metric to monitor for reducing learning rate.
        reduce_lr_min_delta: float = 0.0001,
            Minimum delta of metric to reduce learning rate.
        reduce_lr_patience: int = 5,
            Number of epochs to wait, when the given minimum delta is not
            achieved, before reducing the learning rate.
        reduce_lr_mode: str = "min",
            Direction of the variation of the monitored metric for learning rate.
        reduce_lr_factor: float = 0.9,
            Factor for reduction of learning rate.
        verbose: int = 2,
            Whether to show the loading bar.
            Specifically, the options are:
            * 0 or False: No loading bar.
            * 1 or True: Showing only the loading bar for the epochs.
            * 2: Showing loading bar for both epochs and batches.
        **kwargs: Dict,
            Additional kwargs to pass to the Keras fit call.

        Raises
        -----------------------
        ValueError,
            If given verbose value is not within the available set (0, 1, 2).

        Returns
        -----------------------
        Dataframe with training history.
        """
        sources, destinations, frequencies = self._graph.cooccurence_matrix(
            walk_length=self._walk_length,
            window_size=self._window_size,
            iterations=self._iterations,
            return_weight=self._return_weight,
            explore_weight=self._explore_weight,
            change_edge_type_weight=self._change_edge_type_weight,
            change_node_type_weight=self._change_node_type_weight,
            dense_node_mapping=self._dense_node_mapping,
            max_neighbours=self._max_neighbours,
            random_state=self._random_state,
            verbose=verbose > 0
        )
        if self._support_mirrored_strategy:
            sources = sources.astype(float)
            destinations = destinations.astype(float)
        return super().fit(
            (sources, destinations), frequencies,
            epochs=epochs,
            batch_size=batch_size,
            early_stopping_monitor=early_stopping_monitor,
            early_stopping_min_delta=early_stopping_min_delta,
            early_stopping_patience=early_stopping_patience,
            early_stopping_mode=early_stopping_mode,
            reduce_lr_monitor=reduce_lr_monitor,
            reduce_lr_min_delta=reduce_lr_min_delta,
            reduce_lr_patience=reduce_lr_patience,
            reduce_lr_mode=reduce_lr_mode,
            reduce_lr_factor=reduce_lr_factor,
            verbose=verbose,
            **kwargs
        )

Return pandas dataframe with training history.

Parameters
  • epochs (int = 1000,): Epochs to train the model for.
  • batch_size (int = 2**20,): The batch size. Typically, batch sizes for the GloVe model can be immense.
  • early_stopping_monitor (str = "loss",): Metric to monitor for early stopping.
  • early_stopping_min_delta (float = 0.0001,): Minimum delta of metric to stop the training.
  • early_stopping_patience (int = 10,): Number of epochs to wait, when the given minimum delta is not achieved, before triggering early stopping.
  • early_stopping_mode (str = "min",): Direction of the variation of the monitored metric for early stopping.
  • reduce_lr_monitor (str = "loss",): Metric to monitor for reducing learning rate.
  • reduce_lr_min_delta (float = 0.0001,): Minimum delta of metric to reduce learning rate.
  • reduce_lr_patience (int = 5,): Number of epochs to wait, when the given minimum delta is not achieved, before reducing the learning rate.
  • reduce_lr_mode (str = "min",): Direction of the variation of the monitored metric for learning rate.
  • reduce_lr_factor (float = 0.9,): Factor for reduction of learning rate.
  • verbose (int = 2,): Whether to show the loading bar. Specifically, the options are:
    • 0 or False: No loading bar.
    • 1 or True: Showing only the loading bar for the epochs.
    • 2: Showing loading bar for both epochs and batches.
  • **kwargs (Dict,): Additional kwargs to pass to the Keras fit call.
Raises
  • ValueError,: If given verbose value is not within the available set (0, 1, 2).
Returns
  • Dataframe with training history.
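Continuing the construction sketch above (where `model` is a GraphGloVe instance), a fit call with the main defaults made explicit might look as follows; the co-occurrence matrix is computed once from the random walks before the underlying Keras fit call, as shown in the source above. The chosen values simply restate the documented defaults.

history = model.fit(
    epochs=1000,
    batch_size=2**20,              # GloVe batches can typically be very large
    early_stopping_patience=10,
    reduce_lr_patience=5,
)
node_embedding = model.get_embedding_dataframe()   # embedding indexed by node names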
Inherited Members
grape.embiggen.embedders.embedder.Embedder
TERMS_EMBEDDING_LAYER_NAME
trainable
summary
get_layer_weights
embedding
save_embedding
name
save_weights
load_weights
#   class Word2VecSequence(grape.embiggen.sequences.abstract_word2vec_sequence.AbstractWord2VecSequence):
View Source
class Word2VecSequence(AbstractWord2VecSequence):
    """Keras Sequence object for running CBOW and SkipGram on texts."""

    def __getitem__(self, idx: int) -> Tuple[Tuple[np.ndarray, np.ndarray], None]:
        """Return batch corresponding to given index.

        The returned tuple of tuples is composed of an inner tuple containing
        the words vector and the vector of context vectors.
        Depending on the order of the input layers of the models that accept
        this data format, one of the vectors is used as the training input and
        the other one is used as the output for the NCE loss layer.

        The words and contexts vectors contain numeric IDs that represent the
        indices of the words' embedding columns.

        The true output value is None, since no loss function is applied after
        the NCE loss, which is implemented as a layer. This vastly improves the
        speed of the training process, since it does not require allocating
        large one-hot encoded target vectors.

        Parameters
        ---------------
        idx: int,
            Index corresponding to batch to be returned.

        Returns
        ---------------
        Tuple of tuples with input data.
        """
        contexts, words = preprocessing.word2vec(
            self._sequences[idx],
            window_size=self._window_size,
        )

        if self._support_mirrored_strategy:
            return (contexts.astype(float), words.astype(float)), None
        return (contexts, words), None

Keras Sequence object for running CBOW and SkipGram on texts.

Inherited Members
grape.embiggen.sequences.abstract_word2vec_sequence.AbstractWord2VecSequence
AbstractWord2VecSequence
on_epoch_end
keras_mixed_sequence.utils.sequence.Sequence
batch_size
reset
elapsed_epochs
sample_number
steps_per_epoch
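To make the batch layout returned by __getitem__ concrete, the following is a toy sketch with hand-made numpy arrays; the values and exact array shapes are illustrative assumptions, since a real sequence produces them from the tokenized corpus.

import numpy as np

# Contexts: one row per central word, with window_size * 2 context IDs
# (here window_size = 2). Words: the corresponding central word IDs.
contexts = np.array([[1, 2, 4, 5],
                     [0, 1, 3, 4]])
words = np.array([3, 2])

# Layout of a batch as returned by Word2VecSequence.__getitem__:
# the target is None because the NCE loss is implemented as a layer.
batch = ((contexts, words), None)
(contexts_batch, words_batch), target = batch
assert target is None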
#   class NodeTransformer:
View Source
class NodeTransformer:
    """NodeTransformer class to convert nodes to edge embeddings."""

    def __init__(
        self,
        numeric_node_ids: bool = False,
        aligned_node_mapping: bool = False,
        support_mirrored_strategy: bool = False,
    ):
        """Create new NodeTransformer object.

        Parameters
        -------------------
        numeric_node_ids: bool = False,
            Whether to return the numeric node IDs instead of the node embedding.
        aligned_node_mapping: bool = False,
            This parameter specifies whether the mapping of the embeddings nodes
            matches the internal node mapping of the given graph.
            If these two mappings do not match, the generated edge embedding
            will be meaningless.
        support_mirrored_strategy: bool = False,
            Whether to patch support for mirror strategy.
            At the time of writing, TensorFlow's MirrorStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        """
        self._numeric_node_ids = numeric_node_ids
        self._support_mirrored_strategy = support_mirrored_strategy
        self._embedding = None
        self._embedding_numpy = None
        self._aligned_node_mapping = aligned_node_mapping

    @property
    def numeric_node_ids(self) -> bool:
        """Return whether the transformer returns numeric node IDs."""
        return self._numeric_node_ids

    def fit(self, embedding: pd.DataFrame):
        """Fit the model.

        Parameters
        -------------------------
        embedding: pd.DataFrame,
            Embedding to use to fit the transformer.
            This is a pandas DataFrame and NOT a numpy array because we need
            to be able to remap correctly the vector embeddings in case of
            graphs that do not respect the same internal node mapping but have
            the same node set. It is possible to remap such graphs using
            Ensmallen's remap method but it may be less intuitive to users.
        """
        if not isinstance(embedding, pd.DataFrame):
            raise ValueError("Given embedding is not a pandas DataFrame.")
        self._embedding = embedding
        self._embedding_numpy = embedding.to_numpy()

    def transform(self, nodes: Union[List[str], List[int]]) -> np.ndarray:
        """Return embeddings from given node.

        Parameters
        --------------------------
        nodes: Union[List[str], List[int]],
            List of nodes whose embedding is to be returned.
            By default this should be a list of strings; if
            aligned_node_mapping is set, this method also accepts
            a list of ints.

        Raises
        --------------------------
        ValueError,
            If embedding is not fitted.

        Returns
        --------------------------
        Numpy array of embeddings.
        """
        if self._embedding is None and not self.numeric_node_ids:
            raise ValueError(
                "Transformer was not fitted yet."
            )

        if self._aligned_node_mapping:
            if self.numeric_node_ids:
                if self._support_mirrored_strategy:
                    return nodes.astype(float)
                return nodes
            return self._embedding_numpy[nodes]

        if self.numeric_node_ids:
            ids = np.where(self._embedding.index.isin(nodes))
            if self._support_mirrored_strategy:
                return ids.astype(float)
            return ids

        return self._embedding.loc[nodes].to_numpy()

NodeTransformer class to convert nodes to edge embeddings.

#   NodeTransformer( numeric_node_ids: bool = False, aligned_node_mapping: bool = False, support_mirrored_strategy: bool = False )
View Source
    def __init__(
        self,
        numeric_node_ids: bool = False,
        aligned_node_mapping: bool = False,
        support_mirrored_strategy: bool = False,
    ):
        """Create new NodeTransformer object.

        Parameters
        -------------------
        numeric_node_ids: bool = False,
            Whether to return the numeric node IDs instead of the node embedding.
        aligned_node_mapping: bool = False,
            This parameter specifies whether the mapping of the embeddings nodes
            matches the internal node mapping of the given graph.
            If these two mappings do not match, the generated edge embedding
            will be meaningless.
        support_mirrored_strategy: bool = False,
            Whether to patch support for mirror strategy.
            At the time of writing, TensorFlow's MirrorStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        """
        self._numeric_node_ids = numeric_node_ids
        self._support_mirrored_strategy = support_mirrored_strategy
        self._embedding = None
        self._embedding_numpy = None
        self._aligned_node_mapping = aligned_node_mapping

Create new NodeTransformer object.

Parameters
  • numeric_node_ids (bool = False,): Whether to return the numeric node IDs instead of the node embedding.
  • aligned_node_mapping (bool = False,): This parameter specifies whether the mapping of the embeddings nodes matches the internal node mapping of the given graph. If these two mappings do not match, the generated edge embedding will be meaningless.
  • support_mirrored_strategy (bool = False,): Whether to patch support for mirror strategy. At the time of writing, TensorFlow's MirrorStrategy does not support input values different from floats, therefore to support it we need to convert the unsigned int 32 values that represent the indices of the embedding layers we receive from Ensmallen to floats. This will generally slow down performance, but in the context of exploiting multiple GPUs it may be unnoticeable.
#   numeric_node_ids: bool

Return whether the transformer returns numeric node IDs.

#   def fit(self, embedding: pandas.core.frame.DataFrame):
View Source
    def fit(self, embedding: pd.DataFrame):
        """Fit the model.

        Parameters
        -------------------------
        embedding: pd.DataFrame,
            Embedding to use to fit the transformer.
            This is a pandas DataFrame and NOT a numpy array because we need
            to be able to remap correctly the vector embeddings in case of
            graphs that do not respect the same internal node mapping but have
            the same node set. It is possible to remap such graphs using
            Ensmallen's remap method but it may be less intuitive to users.
        """
        if not isinstance(embedding, pd.DataFrame):
            raise ValueError("Given embedding is not a pandas DataFrame.")
        self._embedding = embedding
        self._embedding_numpy = embedding.to_numpy()

Fit the model.

Parameters
  • embedding (pd.DataFrame,): Embedding to use to fit the transformer. This is a pandas DataFrame and NOT a numpy array because we need to be able to remap correctly the vector embeddings in case of graphs that do not respect the same internal node mapping but have the same node set. It is possible to remap such graphs using Ensmallen's remap method but it may be less intuitive to users.
#   def transform(self, nodes: Union[List[str], List[int]]) -> numpy.ndarray:
View Source
    def transform(self, nodes: Union[List[str], List[int]]) -> np.ndarray:
        """Return embeddings from given node.

        Parameters
        --------------------------
        nodes: Union[List[str], List[int]],
            List of nodes whose embedding is to be returned.
            By default this should be a list of strings; if
            aligned_node_mapping is set, this method also accepts
            a list of ints.

        Raises
        --------------------------
        ValueError,
            If embedding is not fitted.

        Returns
        --------------------------
        Numpy array of embeddings.
        """
        if self._embedding is None and not self.numeric_node_ids:
            raise ValueError(
                "Transformer was not fitted yet."
            )

        if self._aligned_node_mapping:
            if self.numeric_node_ids:
                if self._support_mirrored_strategy:
                    return nodes.astype(float)
                return nodes
            return self._embedding_numpy[nodes]

        if self.numeric_node_ids:
            ids = np.where(self._embedding.index.isin(nodes))[0]
            if self._support_mirrored_strategy:
                return ids.astype(float)
            return ids

        return self._embedding.loc[nodes].to_numpy()

Return embeddings from given node.

Parameters
  • nodes (Union[List[str], List[int]],): List of nodes whose embedding is to be returned. By default this should be a list of strings; if aligned_node_mapping is set, then this method also accepts a list of ints.
Raises
  • ValueError,: If embedding is not fitted.
Returns
  • Numpy array of embeddings.
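
As a minimal usage sketch, assuming a small toy embedding indexed by node name (the node names below are made up for the example):

import numpy as np
import pandas as pd

# Toy embedding: one 4-dimensional vector per node, indexed by node name.
embedding = pd.DataFrame(
    np.random.rand(3, 4),
    index=["node_a", "node_b", "node_c"]
)

node_transformer = NodeTransformer()
node_transformer.fit(embedding)

# Returns a (2, 4) numpy array with the vectors of the requested nodes.
vectors = node_transformer.transform(["node_a", "node_c"])
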
#   class EdgeTransformer:
View Source
class EdgeTransformer:
    """EdgeTransformer class to convert edges to edge embeddings."""

    methods = {
        "Hadamard": lambda x1, x2: np.multiply(x1, x2, out=x1),
        "Sum": lambda x1, x2: np.add(x1, x2, out=x1),
        "Average": lambda x1, x2: np.divide(np.add(x1, x2, out=x1), 2, out=x1),
        "L1": lambda x1, x2: np.subtract(x1, x2, out=x1),
        "AbsoluteL1": lambda x1, x2: np.abs(np.subtract(x1, x2, out=x1), out=x1),
        "L2": lambda x1, x2: np.power(np.subtract(x1, x2, out=x1), 2, out=x1),
        "Concatenate": lambda x1, x2: np.hstack((x1, x2)),
        None: lambda x1, x2: np.vstack((x1, x2)).T,
    }

    def __init__(
        self,
        method: str = "Hadamard",
        aligned_node_mapping: bool = False,
        support_mirrored_strategy: bool = False,
    ):
        """Create new EdgeTransformer object.

        Parameters
        ------------------------
        method: str = "Hadamard",
            Method to use for the embedding.
            If None is provided, the numeric node ID tuples are returned instead.
            Can either be 'Hadamard', 'Sum', 'Average', 'L1', 'AbsoluteL1', 'L2' or 'Concatenate'.
        aligned_node_mapping: bool = False,
            This parameter specifies whether the mapping of the embeddings nodes
            matches the internal node mapping of the given graph.
            If these two mappings do not match, the generated edge embedding
            will be meaningless.
        support_mirrored_strategy: bool = False,
            Whether to patch support for mirrored strategy.
            At the time of writing, TensorFlow's MirroredStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        """
        if isinstance(method, str) and method not in EdgeTransformer.methods:
            raise ValueError((
                "Given method '{}' is not supported. "
                "Supported methods are {}, or alternatively a lambda."
            ).format(
                method, ", ".join(list(EdgeTransformer.methods.keys()))
            ))
        self._transformer = NodeTransformer(
            numeric_node_ids=method is None,
            aligned_node_mapping=aligned_node_mapping,
            support_mirrored_strategy=support_mirrored_strategy
        )
        self._method_name = method
        self._method = EdgeTransformer.methods[self._method_name]

    @property
    def numeric_node_ids(self) -> bool:
        """Return whether the transformer returns numeric node IDs."""
        return self._transformer.numeric_node_ids

    @property
    def method(self) -> str:
        """Return the used edge embedding method."""
        return self._method_name

    def fit(self, embedding: pd.DataFrame):
        """Fit the model.

        Parameters
        -------------------------
        embedding: pd.DataFrame,
            Embedding to use to fit the transformer.
            This is a pandas DataFrame and NOT a numpy array because we need
            to be able to remap correctly the vector embeddings in case of
            graphs that do not respect the same internal node mapping but have
            the same node set. It is possible to remap such graphs using
            Ensmallen's remap method but it may be less intuitive to users.

        Raises
        -------------------------
        ValueError,
            If the given method is None there is no need to call the fit method.
        """
        if self._method is None:
            raise ValueError(
                "There is no need to call the fit when edge method is None."
            )
        self._transformer.fit(embedding)

    def transform(self, sources: List[str], destinations: List[str]) -> np.ndarray:
        """Return embedding for given edges using provided method.

        Parameters
        --------------------------
        sources: List[str],
            List of source nodes whose embedding is to be returned.
        destinations: List[str],
            List of destination nodes whose embedding is to be returned.

        Raises
        --------------------------
        ValueError,
            If embedding is not fitted.

        Returns
        --------------------------
        Numpy array of embeddings.
        """
        return self._method(
            self._transformer.transform(sources),
            self._transformer.transform(destinations)
        )

EdgeTransformer class to convert edges to edge embeddings.

#   EdgeTransformer( method: str = 'Hadamard', aligned_node_mapping: bool = False, support_mirrored_strategy: bool = False )
View Source
    def __init__(
        self,
        method: str = "Hadamard",
        aligned_node_mapping: bool = False,
        support_mirrored_strategy: bool = False,
    ):
        """Create new EdgeTransformer object.

        Parameters
        ------------------------
        method: str = "Hadamard",
            Method to use for the embedding.
            If None is provided, the numeric node ID tuples are returned instead.
            Can either be 'Hadamard', 'Sum', 'Average', 'L1', 'AbsoluteL1', 'L2' or 'Concatenate'.
        aligned_node_mapping: bool = False,
            This parameter specifies whether the mapping of the embeddings nodes
            matches the internal node mapping of the given graph.
            If these two mappings do not match, the generated edge embedding
            will be meaningless.
        support_mirrored_strategy: bool = False,
            Whether to patch support for mirrored strategy.
            At the time of writing, TensorFlow's MirroredStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        """
        if isinstance(method, str) and method not in EdgeTransformer.methods:
            raise ValueError((
                "Given method '{}' is not supported. "
                "Supported methods are {}, or alternatively a lambda."
            ).format(
                method, ", ".join(list(EdgeTransformer.methods.keys()))
            ))
        self._transformer = NodeTransformer(
            numeric_node_ids=method is None,
            aligned_node_mapping=aligned_node_mapping,
            support_mirrored_strategy=support_mirrored_strategy
        )
        self._method_name = method
        self._method = EdgeTransformer.methods[self._method_name]

Create new EdgeTransformer object.

Parameters
  • method (str = "Hadamard",): Method to use for the embedding. If None is used, we return instead the numeric tuples. Can either be 'Hadamard', 'Sum', 'Average', 'L1', 'AbsoluteL1', 'L2' or 'Concatenate'.
  • aligned_node_mapping (bool = False,): This parameter specifies whether the mapping of the embeddings nodes matches the internal node mapping of the given graph. If these two mappings do not match, the generated edge embedding will be meaningless.
  • support_mirrored_strategy (bool = False,): Whether to patch support for mirrored strategy. At the time of writing, TensorFlow's MirroredStrategy does not support input values different from floats, therefore to support it we need to convert the unsigned int 32 values that represent the indices of the embedding layers we receive from Ensmallen to floats. This will generally slow down performance, but in the context of exploiting multiple GPUs it may be unnoticeable.
#   methods = {'Hadamard': <lambda>, 'Sum': <lambda>, 'Average': <lambda>, 'L1': <lambda>, 'AbsoluteL1': <lambda>, 'L2': <lambda>, 'Concatenate': <lambda>, None: <lambda>}
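
Each entry in methods combines two node-embedding arrays into a single edge representation. The following minimal sketch applies a few of them to toy vectors; note that most of these lambdas write their result into the first argument through out=x1, so copies are passed here.

import numpy as np

x1 = np.array([1.0, 2.0, 3.0])
x2 = np.array([4.0, 5.0, 6.0])

hadamard = EdgeTransformer.methods["Hadamard"](x1.copy(), x2)   # [ 4. 10. 18.]
average = EdgeTransformer.methods["Average"](x1.copy(), x2)     # [2.5 3.5 4.5]
l2 = EdgeTransformer.methods["L2"](x1.copy(), x2)               # [9. 9. 9.]
concatenated = EdgeTransformer.methods["Concatenate"](x1, x2)   # 6 values
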
#   numeric_node_ids: bool

Return whether the transformer returns numeric node IDs.

#   method: str

Return the used edge embedding method.

#   def fit(self, embedding: pandas.core.frame.DataFrame):
View Source
    def fit(self, embedding: pd.DataFrame):
        """Fit the model.

        Parameters
        -------------------------
        embedding: pd.DataFrame,
            Embedding to use to fit the transformer.
            This is a pandas DataFrame and NOT a numpy array because we need
            to be able to remap correctly the vector embeddings in case of
            graphs that do not respect the same internal node mapping but have
            the same node set. It is possible to remap such graphs using
            Ensmallen's remap method but it may be less intuitive to users.

        Raises
        -------------------------
        ValueError,
            If the given method is None there is no need to call the fit method.
        """
        if self._method is None:
            raise ValueError(
                "There is no need to call the fit when edge method is None."
            )
        self._transformer.fit(embedding)

Fit the model.

Parameters
  • embedding (pd.DataFrame,): Embedding to use to fit the transformer. This is a pandas DataFrame and NOT a numpy array because we need to be able to remap correctly the vector embeddings in case of graphs that do not respect the same internal node mapping but have the same node set. It is possible to remap such graphs using Ensmallen's remap method but it may be less intuitive to users.
Raises
  • ValueError,: If the given method is None there is no need to call the fit method.
#   def transform(self, sources: List[str], destinations: List[str]) -> numpy.ndarray:
View Source
    def transform(self, sources: List[str], destinations: List[str]) -> np.ndarray:
        """Return embedding for given edges using provided method.

        Parameters
        --------------------------
        sources: List[str],
            List of source nodes whose embedding is to be returned.
        destinations: List[str],
            List of destination nodes whose embedding is to be returned.

        Raises
        --------------------------
        ValueError,
            If embedding is not fitted.

        Returns
        --------------------------
        Numpy array of embeddings.
        """
        return self._method(
            self._transformer.transform(sources),
            self._transformer.transform(destinations)
        )

Return embedding for given edges using provided method.

Parameters
  • sources (List[str],): List of source nodes whose embedding is to be returned.
  • destinations (List[str],): List of destination nodes whose embedding is to be returned.
Raises
  • ValueError,: If embedding is not fitted.
Returns
  • Numpy array of embeddings.
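
As a minimal sketch of the fit/transform workflow, assuming a toy node embedding indexed by node name:

import numpy as np
import pandas as pd

embedding = pd.DataFrame(
    np.random.rand(4, 8),
    index=["a", "b", "c", "d"]
)

edge_transformer = EdgeTransformer(method="Concatenate")
edge_transformer.fit(embedding)

# One 16-dimensional edge vector per (source, destination) pair.
edge_vectors = edge_transformer.transform(
    sources=["a", "b"],
    destinations=["c", "d"]
)
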
#   class GraphTransformer:
View Source
class GraphTransformer:
    """GraphTransformer class to convert graphs to edge embeddings."""

    def __init__(
        self,
        method: str = "Hadamard",
        aligned_node_mapping: bool = False,
        support_mirrored_strategy: bool = False,
    ):
        """Create new GraphTransformer object.

        Parameters
        ------------------------
        method: str = "hadamard",
            Method to use for the embedding.
            Can either be 'Hadamard', 'Sum', 'Average', 'L1', 'AbsoluteL1', 'L2' or 'Concatenate'.
        aligned_node_mapping: bool = False,
            This parameter specifies whether the mapping of the embeddings nodes
            matches the internal node mapping of the given graph.
            If these two mappings do not match, the generated edge embedding
            will be meaningless.
        support_mirrored_strategy: bool = False,
            Whether to patch support for mirrored strategy.
            At the time of writing, TensorFlow's MirroredStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        """
        self._transformer = EdgeTransformer(
            method=method,
            aligned_node_mapping=aligned_node_mapping,
            support_mirrored_strategy=support_mirrored_strategy
        )
        self._aligned_node_mapping = aligned_node_mapping

    @property
    def numeric_node_ids(self) -> bool:
        """Return whether the transformer returns numeric node IDs."""
        return self._transformer.numeric_node_ids

    @property
    def method(self) -> str:
        """Return the used edge embedding method."""
        return self._transformer.method

    def fit(self, embedding: pd.DataFrame):
        """Fit the model.

        Parameters
        -------------------------
        embedding: pd.DataFrame,
            Embedding to use to fit the transformer.
            This is a pandas DataFrame and NOT a numpy array because we need
            to be able to remap correctly the vector embeddings in case of
            graphs that do not respect the same internal node mapping but have
            the same node set. It is possible to remap such graphs using
            Ensmallen's remap method but it may be less intuitive to users.
        """
        self._transformer.fit(embedding)

    def transform(
        self,
        graph: Union[Graph, np.ndarray, List[List[str]], List[List[int]]],
    ) -> np.ndarray:
        """Return edge embedding for given graph using provided method.

        Parameters
        --------------------------
        graph: Union[Graph, np.ndarray, List[List[str]], List[List[int]]],
            The graph whose edges are to be embedded.
            It can either be a Graph or a list of lists of edges.

        Raises
        --------------------------
        ValueError,
            If embedding is not fitted.

        Returns
        --------------------------
        Numpy array of embeddings.
        """
        if isinstance(graph, Graph):
            if self._aligned_node_mapping:
                graph = graph.get_edge_node_ids(directed=False)
            else:
                graph = graph.get_edge_node_names(directed=False)
        if isinstance(graph, List):
            graph = np.array(graph)
        if isinstance(graph, np.ndarray):
            sources = graph[:, 0]
            destinations = graph[:, 1]
        return self._transformer.transform(sources, destinations)

GraphTransformer class to convert graphs to edge embeddings.

#   GraphTransformer( method: str = 'Hadamard', aligned_node_mapping: bool = False, support_mirrored_strategy: bool = False )
View Source
    def __init__(
        self,
        method: str = "Hadamard",
        aligned_node_mapping: bool = False,
        support_mirrored_strategy: bool = False,
    ):
        """Create new GraphTransformer object.

        Parameters
        ------------------------
        method: str = "hadamard",
            Method to use for the embedding.
            Can either be 'Hadamard', 'Sum', 'Average', 'L1', 'AbsoluteL1', 'L2' or 'Concatenate'.
        aligned_node_mapping: bool = False,
            This parameter specifies whether the mapping of the embeddings nodes
            matches the internal node mapping of the given graph.
            If these two mappings do not match, the generated edge embedding
            will be meaningless.
        support_mirrored_strategy: bool = False,
            Whether to patch support for mirrored strategy.
            At the time of writing, TensorFlow's MirroredStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        """
        self._transformer = EdgeTransformer(
            method=method,
            aligned_node_mapping=aligned_node_mapping,
            support_mirrored_strategy=support_mirrored_strategy
        )
        self._aligned_node_mapping = aligned_node_mapping

Create new GraphTransformer object.

Parameters
  • method (str = "hadamard",): Method to use for the embedding. Can either be 'Hadamard', 'Sum', 'Average', 'L1', 'AbsoluteL1', 'L2' or 'Concatenate'.
  • aligned_node_mapping (bool = False,): This parameter specifies whether the mapping of the embeddings nodes matches the internal node mapping of the given graph. If these two mappings do not match, the generated edge embedding will be meaningless.
  • support_mirrored_strategy (bool = False,): Whether to patch support for mirrored strategy. At the time of writing, TensorFlow's MirroredStrategy does not support input values different from floats, therefore to support it we need to convert the unsigned int 32 values that represent the indices of the embedding layers we receive from Ensmallen to floats. This will generally slow down performance, but in the context of exploiting multiple GPUs it may be unnoticeable.
#   numeric_node_ids: bool

Return whether the transformer returns numeric node IDs.

#   method: str

Return the used edge embedding method.

#   def fit(self, embedding: pandas.core.frame.DataFrame):
View Source
    def fit(self, embedding: pd.DataFrame):
        """Fit the model.

        Parameters
        -------------------------
        embedding: pd.DataFrame,
            Embedding to use to fit the transformer.
            This is a pandas DataFrame and NOT a numpy array because we need
            to be able to remap correctly the vector embeddings in case of
            graphs that do not respect the same internal node mapping but have
            the same node set. It is possible to remap such graphs using
            Ensmallen's remap method but it may be less intuitive to users.
        """
        self._transformer.fit(embedding)

Fit the model.

Parameters
  • embedding (pd.DataFrame,): Embedding to use to fit the transformer. This is a pandas DataFrame and NOT a numpy array because we need to be able to remap correctly the vector embeddings in case of graphs that do not respect the same internal node mapping but have the same node set. It is possible to remap such graphs using Ensmallen's remap method but it may be less intuitive to users.
#   def transform( self, graph: Union[Graph, numpy.ndarray, List[List[str]], List[List[int]]] ) -> numpy.ndarray:
View Source
    def transform(
        self,
        graph: Union[Graph, np.ndarray, List[List[str]], List[List[int]]],
    ) -> np.ndarray:
        """Return edge embedding for given graph using provided method.

        Parameters
        --------------------------
        graph: Union[Graph, np.ndarray, List[List[str]], List[List[int]]],
            The graph whose edges are to be embedded.
            It can either be a Graph or a list of lists of edges.

        Raises
        --------------------------
        ValueError,
            If embedding is not fitted.

        Returns
        --------------------------
        Numpy array of embeddings.
        """
        if isinstance(graph, Graph):
            if self._aligned_node_mapping:
                graph = graph.get_edge_node_ids(directed=False)
            else:
                graph = graph.get_edge_node_names(directed=False)
        if isinstance(graph, List):
            graph = np.array(graph)
        if isinstance(graph, np.ndarray):
            sources = graph[:, 0]
            destinations = graph[:, 1]
        return self._transformer.transform(sources, destinations)

Return edge embedding for given graph using provided method.

Parameters
  • graph (Union[Graph, np.ndarray, List[List[str]], List[List[int]]],): The graph whose edges are to be embedded. It can either be a Graph or a list of lists of edges.
Raises
  • ValueError,: If embedding is not fitted.
Returns
  • Numpy array of embeddings.
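
A minimal sketch of transforming an explicit edge list, assuming the node names appear in the index of the fitted embedding (a Graph instance can be passed instead):

import numpy as np
import pandas as pd

embedding = pd.DataFrame(
    np.random.rand(3, 8),
    index=["a", "b", "c"]
)

graph_transformer = GraphTransformer(method="Hadamard")
graph_transformer.fit(embedding)

# Edges expressed as a list of [source, destination] node names.
edges = [["a", "b"], ["b", "c"], ["a", "c"]]
edge_embedding = graph_transformer.transform(edges)  # shape (3, 8)
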
#   class CorpusTransformer:
View Source
class CorpusTransformer:
    """Simple class to tekenize textual corpuses."""

    def __init__(
        self,
        synonyms: Dict = None,
        language: str = "english",
        tokenizer_method: str = "nltk",
        apply_stemming: bool = True,
        remove_stop_words: bool = True,
        remove_punctuation: bool = True,
        remove_digits: bool = False,
        extra_stop_words: Set[str] = None,
        min_word_length: int = 2,
        min_sequence_length: int = 0,
        min_count: int = 0,
        max_count: int = math.inf,
        to_lower_case: bool = True,
        verbose: bool = True,
        processes: int = None,
        use_multiprocessing: bool = True
    ):
        """Create new CorpusTransformer object.

        This is a GENERIC text tokenizer and is only useful for basic examples,
        as any advanced setting will require a custom tokenizer.

        Parameters
        ----------------------------
        synonyms: Dict = None,
            The synonyms to use.
        language: str = "english",
            The language for the stopwords.
        tokenizer_method: str = "nltk",
            The tokenizer method to be used.
            Can either be `nltk`, that is, using the nltk default method,
            or alternatively can be `space`, that is splitting only on spaces.
        apply_stemming: bool = True,
            Whether to apply a stemming procedure, which
            is enabled by default.
            The algorithm used is a Porter Stemmer.
        remove_stop_words: bool = True,
            Whether to remove stopwords,
            as defined from NLTK for the given language.
        remove_punctuation: bool = True,
            Whether to remove punctuation, as defined from the string package.
        remove_digits: bool = False,
            Whether to remove words composed of only digits.
        extra_stop_words: Set[str] = None,
            The additional stop words to be removed.
        min_word_length: int = 2,
            Minimum length of the corpus words.
        min_sequence_length: int = 0,
            Minimum length of the tokenized sequences.
            If you are using word2vec, the sequences MUST be longer than
            two times the window size plus one.
        min_count: int = 0,
            Drop terms that appear fewer than the given number of times.
        max_count: int = math.inf,
            Drop terms that appear more than the given number of times.
        to_lower_case: bool = True,
            Whether to convert terms to lowercase.
        processes: int = None,
            Number of parallel processes to use.
            If processes is None, all available processes are used.
        verbose: bool = True,
            Whether to show loading bars and log process.
        use_multiprocessing: bool = True,
            Whether to use multiprocessing.

        Raises
        --------------------------
        ValueError,
            If the given tokenizer method is not supported.
        """
        try:
            import nltk
        except ImportError:
            raise ImportError(
                "The package nltk is not installed!\n"
                "If you need to use the CorpusTransformer object, "
                "please do install the nltk package.\n"
                "This package has to be installed separetely because it "
                "comes with added complexity that we prefer to spare the user "
                "when their main interest lies within graph embedding."
            )
        self._synonyms = {} if synonyms is None else synonyms
        self._stopwords = set() if extra_stop_words is None else extra_stop_words
        if remove_stop_words:
            self._stopwords |= set(stopwords.words(language))
        if remove_punctuation:
            self._stopwords |= set(string.punctuation)
        self._remove_digits = remove_digits
        self._min_word_length = min_word_length
        self._min_count = min_count
        self._max_count = max_count
        self._min_sequence_length = min_sequence_length
        self._to_lower_case = to_lower_case
        self._use_multiprocessing = use_multiprocessing
        self._processes = cpu_count() if processes is None else processes
        self._verbose = verbose
        self._stemmer = PorterStemmer() if apply_stemming else None
        if tokenizer_method not in ("nltk", "space"):
            raise ValueError(
                (
                    "Given tokenizer method `{}` is not supported. "
                    "The supported methods are `nltk` and `space`."
                ).format(tokenizer_method)
            )
        self._tokenizer_method = tokenizer_method
        self._tokenizer = None

    def get_synonym(self, word: str) -> str:
        """Return the synonym of the given word, if available.

        Parameters
        ----------------------------
        word: str,
            The word whose synonym is to be found.

        Returns
        ----------------------------
        The given word synonym.
        """
        return self._synonyms.get(word, word)

    def split_line(self, line: str) -> List[str]:
        """Return preliminary tokenization of the line.

        Parameters
        ---------------------
        line: str,
            The line to be tokenized.

        Returns
        ---------------------
        The list of string tokens.
        """
        if self._to_lower_case:
            line = line.lower()

        if self._tokenizer_method == "nltk":
            return word_tokenize(line)

        return line.split(" ")

    def tokenize_line(self, line: str) -> List[str]:
        """Return tokenized line.

        Parameters
        ---------------------
        line: str,
            The line to be tokenized.

        Returns
        ---------------------
        The list of string tokens.
        """
        return [
            self._stemmer.stem(self.get_synonym(word))
            if self._stemmer is not None
            else self.get_synonym(word)
            for word in word_tokenize(line.lower() if self._to_lower_case else line)
            if word not in self._stopwords and
            len(word) > self._min_word_length and
            (not self._remove_digits or not word.isnumeric())
        ]

    def tokenize_lines(self, lines: List[str]) -> List[List[str]]:
        """Return tokenized lines.

        Parameters
        ---------------------
        lines: List[str],
            List of lines to be tokenized.

        Returns
        ---------------------
        The list of string tokens.
        """
        return [
            self.tokenize_line(line)
            for line in lines
        ]

    def tokenize(self, texts: List[str], return_counts: bool = False):
        """Fit model using stemming from given text.

        Parameters
        ----------------------------
        texts: List[str],
            The texts to tokenize.
        return_counts: bool = False,
            Whether to also return the counts of the terms.

        Return
        -----------------------------
        Either the tokens or a tuple containing the tokens and the counts.
        """
        processes = min(cpu_count(), len(texts))
        chunks_number = processes*2
        chunk_size = max(len(texts) // chunks_number, 1)
        tasks = (
            texts[i:i + chunk_size]
            for i in range(0, len(texts), chunk_size)
        )
        if self._use_multiprocessing:
            with Pool(processes) as p:
                all_tokens = [
                    line
                    for chunk in tqdm(
                        p.imap(
                            self.tokenize_lines,
                            tasks
                        ),
                        desc="Tokenizing",
                        total=chunks_number,
                        disable=not self._verbose
                    )
                    for line in chunk
                ]
                p.close()
                p.join()
        else:
            all_tokens = [
                line
                for chunk in tqdm(
                    tasks,
                    desc="Tokenizing",
                    total=chunks_number,
                    disable=not self._verbose
                )
                for line in self.tokenize_lines(chunk)
            ]

        if return_counts:
            counter = Counter((
                term
                for terms in tqdm(
                    all_tokens,
                    desc="Computing counts of terms",
                    disable=not self._verbose
                )
                for term in terms
            ))
            return all_tokens, counter
        return all_tokens

    def parse_tokens_for_low_frequency(self, tokens_list: List[List[str]]) -> Generator:
        """Yields tokens parsed according to updated stopwords.

        Parameters
        --------------------
        tokens_list: List[List[str]],
            List of the string tokens.

        Yields
        --------------------
        The filtered token lists.
        """
        for tokens in tqdm(
            tokens_list,
            total=len(tokens_list),
            desc="Filtering low frequency terms",
            disable=not self._verbose
        ):
            new_tokens = [
                token
                for token in tokens
                if token not in self._stopwords
            ]
            if len(new_tokens) > 0:
                yield new_tokens

    def fit(self, texts: List[str]):
        """Fit the transformer.

        Parameters
        ----------------------------
        texts: List[str],
            The texts to use for the fitting.

        Raises
        ----------------------------
        ValueError,
            If there are NaN values within given texts.
        ValueError,
            If there are non string values within given texts.
        """
        if pd.isna(texts).any():
            raise ValueError(
                "There are NaN values within the given texts."
            )
        if any(not isinstance(text, str) for text in texts):
            raise ValueError(
                "There are not string values within the given texts."
            )
        tokens_list, counts = self.tokenize(texts, True)

        if self._min_count > 0 or math.isfinite(self._max_count):
            self._stopwords |= {
                word
                for word, count in counts.items()
                if count <= self._min_count or count >= self._max_count
            }
            tokens_list = self.parse_tokens_for_low_frequency(tokens_list)

        self._tokenizer = Tokenizer(
            lower=self._to_lower_case
        )
        self._tokenizer.fit_on_texts((
            " ".join(tokens)
            for tokens in tqdm(
                tokens_list,
                desc="Fitting tokenizer",
                total=len(texts),
                disable=not self._verbose
            )
        ))

    @property
    def vocabulary_size(self) -> int:
        """Return number of different terms."""
        return len(self._tokenizer.word_counts)

    def reverse_transform(self, sequences: np.ndarray) -> List[str]:
        """Reverse the sequence to texts.

        Parameters
        ------------------------
        sequences: np.ndarray,
            The sequences to counter transform.

        Returns
        ------------------------
        The texts created from the given sequences.
        """
        if isinstance(sequences, (list, tuple)):
            sequences = np.array(sequences)
        return self._tokenizer.sequences_to_texts(sequences)

    def get_word_id(self, word: str) -> int:
        """Get the given words IDs.

        Parameters
        ------------------------
        word: str,
            The word whose ID is to be retrieved.

        Returns
        ------------------------
        The word numeric ID.
        """
        return self._tokenizer.word_index[word]

    def transform(self, texts: List[str]) -> np.ndarray:
        """Transform given text.

        Parameters
        --------------------------
        texts: List[str],
            The texts to encode as digits.

        Raises
        ----------------------------
        ValueError,
            If there are NaN values within given texts.
        ValueError,
            If there are non string values within given texts.

        Returns
        --------------------------
        Numpy array with numpy arrays of tokens.
        """
        if pd.isna(texts).any():
            raise ValueError(
                "There are NaN values within the given texts."
            )
        if any(not isinstance(text, str) for text in texts):
            raise ValueError(
                "There are not string values within the given texts."
            )
        return np.array([
            np.array(tokens, dtype=np.uint64)
            for tokens in self._tokenizer.texts_to_sequences((
                " ".join(tokens)
                for tokens in tqdm(
                    self.tokenize(texts),
                    desc="Transform texts",
                    total=len(texts),
                    disable=not self._verbose
                )
                if len(tokens) >= self._min_sequence_length
            ))
        ], dtype=object)

Simple class to tokenize textual corpora.

#   CorpusTransformer( synonyms: Dict = None, language: str = 'english', tokenizer_method: str = 'nltk', apply_stemming: bool = True, remove_stop_words: bool = True, remove_punctuation: bool = True, remove_digits: bool = False, extra_stop_words: Set[str] = None, min_word_length: int = 2, min_sequence_length: int = 0, min_count: int = 0, max_count: int = inf, to_lower_case: bool = True, verbose: bool = True, processes: int = None, use_multiprocessing: bool = True )
View Source
    def __init__(
        self,
        synonyms: Dict = None,
        language: str = "english",
        tokenizer_method: str = "nltk",
        apply_stemming: bool = True,
        remove_stop_words: bool = True,
        remove_punctuation: bool = True,
        remove_digits: bool = False,
        extra_stop_words: Set[str] = None,
        min_word_length: int = 2,
        min_sequence_length: int = 0,
        min_count: int = 0,
        max_count: int = math.inf,
        to_lower_case: bool = True,
        verbose: bool = True,
        processes: int = None,
        use_multiprocessing: bool = True
    ):
        """Create new CorpusTransformer object.

        This is a GENERIC text tokenizer and is only useful for basic examples,
        as any advanced setting will require a custom tokenizer.

        Parameters
        ----------------------------
        synonyms: Dict = None,
            The synonyms to use.
        language: str = "english",
            The language for the stopwords.
        tokenizer_method: str = "nltk",
            The tokenizer method to be used.
            Can either be `nltk`, that is, using the nltk default method,
            or alternatively can be `space`, that is splitting only on spaces.
        apply_stemming: bool = True,
            Whether to apply a stemming procedure, which
            is enabled by default.
            The algorithm used is a Porter Stemmer.
        remove_stop_words: bool = True,
            Whether to remove stopwords,
            as defined from NLTK for the given language.
        remove_punctuation: bool = True,
            Whether to remove punctuation, as defined from the string package.
        remove_digits: bool = False,
            Whether to remove words composed of only digits.
        extra_stop_words: Set[str] = None,
            The additional stop words to be removed.
        min_word_length: int = 2,
            Minimum length of the corpus words.
        min_sequence_length: int = 0,
            Minimum length of the tokenized sequences.
            If you are using word2vec, the sequences MUST be longer than
            two times the window size plus one.
        min_count: int = 0,
            Drop terms that appear fewer than the given number of times.
        max_count: int = math.inf,
            Drop terms that appear more than the given number of times.
        to_lower_case: bool = True,
            Whether to convert terms to lowercase.
        processes: int = None,
            Number of parallel processes to use.
            If processes is None, all available processes are used.
        verbose: bool = True,
            Whether to show loading bars and log process.
        use_multiprocessing: bool = True,
            Whether to use multiprocessing.

        Raises
        --------------------------
        ValueError,
            If the given tokenizer method is not supported.
        """
        try:
            import nltk
        except ImportError:
            raise ImportError(
                "The package nltk is not installed!\n"
                "If you need to use the CorpusTransformer object, "
                "please do install the nltk package.\n"
                "This package has to be installed separetely because it "
                "comes with added complexity that we prefer to spare the user "
                "when their main interest lies within graph embedding."
            )
        self._synonyms = {} if synonyms is None else synonyms
        self._stopwords = set() if extra_stop_words is None else extra_stop_words
        if remove_stop_words:
            self._stopwords |= set(stopwords.words(language))
        if remove_punctuation:
            self._stopwords |= set(string.punctuation)
        self._remove_digits = remove_digits
        self._min_word_length = min_word_length
        self._min_count = min_count
        self._max_count = max_count
        self._min_sequence_length = min_sequence_length
        self._to_lower_case = to_lower_case
        self._use_multiprocessing = use_multiprocessing
        self._processes = cpu_count() if processes is None else processes
        self._verbose = verbose
        self._stemmer = PorterStemmer() if apply_stemming else None
        if tokenizer_method not in ("nltk", "space"):
            raise ValueError(
                (
                    "Given tokenizer method `{}` is not supported. "
                    "The supported methods are `nltk` and `space`."
                ).format(tokenizer_method)
            )
        self._tokenizer_method = tokenizer_method
        self._tokenizer = None

Create new CorpusTransformer object.

This is a GENERIC text tokenizer and is only useful for basic examples, as any advanced setting will require a custom tokenizer.

Parameters
  • synonyms (Dict = None,): The synonyms to use.
  • language (str = "english",): The language for the stopwords.
  • tokenizer_method (str = "nltk",): The tokenizer method to be used. Can either be nltk, that is, using the nltk default method, or alternatively can be space, that is splitting only on spaces.
  • apply_stemming (bool = True,): Whether to apply a stemming procedure, which is enabled by default. The algorithm used is a Porter Stemmer.
  • remove_stop_words (bool = True,): Whether to remove stopwords, as defined from NLTK for the given language.
  • remove_punctuation (bool = True,): Whether to remove punctuation, as defined from the string package.
  • remove_digits (bool = False,): Whether to remove words composed of only digits.
  • extra_stop_words (Set[str] = None,): The additional stop words to be removed.
  • min_word_length (int = 2,): Minimum length of the corpus words.
  • min_sequence_length (int = 0,): Minimum length of the tokenized sequences. If you are using word2vec, the sequences MUST be longer than two times the window size plus one.
  • min_count (int = 0,): Drop terms that appear fewer than the given number of times.
  • max_count (int = math.inf,): Drop terms that appear more than the given number of times.
  • to_lower_case (bool = True,): Whether to convert terms to lowercase.
  • processes (int = None,): Number of parallel processes to use. If processes is None, all available processes are used.
  • verbose (bool = True,): Whether to show loading bars and log process.
  • use_multiprocessing (bool = True,): Whether to use multiprocessing.
Raises
  • ValueError,: If the given tokenizer method is not supported.
#   def get_synonym(self, word: str) -> str:
View Source
    def get_synonym(self, word: str) -> str:
        """Return the synonym of the given word, if available.

        Parameters
        ----------------------------
        word: str,
            The word whose synonym is to be found.

        Returns
        ----------------------------
        The given word synonym.
        """
        return self._synonyms.get(word, word)

Return the synonym of the given word, if available.

Parameters
  • word (str,): The word whose synonym is to be found.
Returns
  • The given word synonym.
#   def split_line(self, line: str) -> List[str]:
View Source
    def split_line(self, line: str) -> List[str]:
        """Return preliminary tokenization of the line.

        Parameters
        ---------------------
        line: str,
            The line to be tokenized.

        Returns
        ---------------------
        The list of string tokens.
        """
        if self._to_lower_case:
            line = line.lower()

        if self._tokenizer_method == "nltk":
            return word_tokenize(line)

        return line.split(" ")

Return preliminary tokenization of the line.

Parameters
  • line (str,): The line to be tokenized.
Returns
  • The list of string tokens.
#   def tokenize_line(self, line: str) -> List[str]:
View Source
    def tokenize_line(self, line: str) -> List[str]:
        """Return tokenized line.

        Parameters
        ---------------------
        line: str,
            The line to be tokenized.

        Returns
        ---------------------
        The list of string tokens.
        """
        return [
            self._stemmer.stem(self.get_synonym(word))
            if self._stemmer is not None
            else self.get_synonym(word)
            for word in word_tokenize(line.lower() if self._to_lower_case else line)
            if word not in self._stopwords and
            len(word) > self._min_word_length and
            (not self._remove_digits or not word.isnumeric())
        ]

Return tokenized line.

Parameters
  • line (str,): The line to be tokenized.
Returns
  • The list of string tokens.
#   def tokenize_lines(self, lines: List[str]) -> List[List[str]]:
View Source
    def tokenize_lines(self, lines: List[str]) -> List[List[str]]:
        """Return tokenized lines.

        Parameters
        ---------------------
        lines: List[str],
            List of lines to be tokenized.

        Returns
        ---------------------
        The list of string tokens.
        """
        return [
            self.tokenize_line(line)
            for line in lines
        ]

Return tokenized lines.

Parameters
  • lines (List[str],): List of lines to be tokenized.
Returns
  • The list of string tokens.
#   def tokenize(self, texts: List[str], return_counts: bool = False):
View Source
    def tokenize(self, texts: List[str], return_counts: bool = False):
        """Fit model using stemming from given text.

        Parameters
        ----------------------------
        texts: List[str],
            The texts to tokenize.
        return_counts: bool = False,
            Whether to also return the counts of the terms.

        Return
        -----------------------------
        Either the tokens or a tuple containing the tokens and the counts.
        """
        processes = min(cpu_count(), len(texts))
        chunks_number = processes*2
        chunk_size = max(len(texts) // chunks_number, 1)
        tasks = (
            texts[i:i + chunk_size]
            for i in range(0, len(texts), chunk_size)
        )
        if self._use_multiprocessing:
            with Pool(processes) as p:
                all_tokens = [
                    line
                    for chunk in tqdm(
                        p.imap(
                            self.tokenize_lines,
                            tasks
                        ),
                        desc="Tokenizing",
                        total=chunks_number,
                        disable=not self._verbose
                    )
                    for line in chunk
                ]
                p.close()
                p.join()
        else:
            all_tokens = [
                line
                for chunk in tqdm(
                    tasks,
                    desc="Tokenizing",
                    total=chunks_number,
                    disable=not self._verbose
                )
                for line in self.tokenize_lines(chunk)
            ]

        if return_counts:
            counter = Counter((
                term
                for terms in tqdm(
                    all_tokens,
                    desc="Computing counts of terms",
                    disable=not self._verbose
                )
                for term in terms
            ))
            return all_tokens, counter
        return all_tokens

Return the tokenized texts, optionally with the term counts.

Parameters
  • texts (List[str],): The texts to tokenize.
  • return_counts (bool = False,): Whether to also return the counts of the terms.
Returns
  • Either the tokens or a tuple containing the tokens and the counts.
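
A minimal sketch of tokenization with term counts, assuming nltk and its stopwords/tokenizer resources are installed; multiprocessing is disabled to keep the example lightweight:

# Assumes the nltk "punkt" and "stopwords" resources are available.
corpus_transformer = CorpusTransformer(use_multiprocessing=False, verbose=False)

texts = [
    "Graphs can be embedded with random walks.",
    "Random walks sample node neighbourhoods."
]

tokens, counts = corpus_transformer.tokenize(texts, return_counts=True)
# `tokens` is a list with one list of stemmed tokens per input line,
# `counts` is a collections.Counter mapping each term to its frequency.
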

#   def parse_tokens_for_low_frequency(self, tokens_list: List[List[str]]) -> Generator:
View Source
    def parse_tokens_for_low_frequency(self, tokens_list: List[List[str]]) -> Generator:
        """Yields tokens parsed according to updated stopwords.

        Parameters
        --------------------
        tokens_list: List[List[str]],
            List of the string tokens.

        Yields
        --------------------
        The filtered token lists.
        """
        for tokens in tqdm(
            tokens_list,
            total=len(tokens_list),
            desc="Filtering low frequency terms",
            disable=not self._verbose
        ):
            new_tokens = [
                token
                for token in tokens
                if token not in self._stopwords
            ]
            if len(new_tokens) > 0:
                yield new_tokens

Yields tokens parsed according to updated stopwords.

Parameters
  • tokens_list (List[List[str]],): List of the string tokens.
Yields
  • The filtered token lists.
#   def fit(self, texts: List[str]):
View Source
    def fit(self, texts: List[str]):
        """Fit the transformer.

        Parameters
        ----------------------------
        texts: List[str],
            The texts to use for the fitting.

        Raises
        ----------------------------
        ValueError,
            If there are NaN values within given texts.
        ValueError,
            If there are non string values within given texts.
        """
        if pd.isna(texts).any():
            raise ValueError(
                "There are NaN values within the given texts."
            )
        if any(not isinstance(text, str) for text in texts):
            raise ValueError(
                "There are not string values within the given texts."
            )
        tokens_list, counts = self.tokenize(texts, True)

        if self._min_count > 0 or math.isfinite(self._max_count):
            self._stopwords |= {
                word
                for word, count in counts.items()
                if count <= self._min_count or count >= self._max_count
            }
            tokens_list = self.parse_tokens_for_low_frequency(tokens_list)

        self._tokenizer = Tokenizer(
            lower=self._to_lower_case
        )
        self._tokenizer.fit_on_texts((
            " ".join(tokens)
            for tokens in tqdm(
                tokens_list,
                desc="Fitting tokenizer",
                total=len(texts),
                disable=not self._verbose
            )
        ))

Fit the transformer.

Parameters
  • texts (List[str],): The texts to use for the fitting.
Raises
  • ValueError,: If there are NaN values within given texts.
  • ValueError,: If there are non string values within given texts.
#   vocabulary_size: int

Return number of different terms.

#   def reverse_transform(self, sequences: numpy.ndarray) -> List[str]:
View Source
    def reverse_transform(self, sequences: np.ndarray) -> List[str]:
        """Reverse the sequence to texts.

        Parameters
        ------------------------
        sequences: np.ndarray,
            The sequences to counter transform.

        Returns
        ------------------------
        The texts created from the given sequences.
        """
        if isinstance(sequences, (list, tuple)):
            sequences = np.array(sequences)
        return self._tokenizer.sequences_to_texts(sequences)

Reverse the sequence to texts.

Parameters
  • sequences (np.ndarray,): The sequences to counter transform.
Returns
  • The texts created from the given sequences.
#   def get_word_id(self, word: str) -> int:
View Source
    def get_word_id(self, word: str) -> int:
        """Get the given words IDs.

        Parameters
        ------------------------
        word: str,
            The word whose ID is to be retrieved.

        Returns
        ------------------------
        The word numeric ID.
        """
        return self._tokenizer.word_index[word]

Get the given word's ID.

Parameters
  • word (str,): The word whose ID is to be retrieved.
Returns
  • The word numeric ID.
#   def transform(self, texts: List[str]) -> numpy.ndarray:
View Source
    def transform(self, texts: List[str]) -> np.ndarray:
        """Transform given text.

        Parameters
        --------------------------
        texts: List[str],
            The texts to encode as digits.

        Raises
        ----------------------------
        ValueError,
            If there are NaN values within given texts.
        ValueError,
            If there are non string values within given texts.

        Returns
        --------------------------
        Numpy array with numpy arrays of tokens.
        """
        if pd.isna(texts).any():
            raise ValueError(
                "There are NaN values within the given texts."
            )
        if any(not isinstance(text, str) for text in texts):
            raise ValueError(
                "There are not string values within the given texts."
            )
        return np.array([
            np.array(tokens, dtype=np.uint64)
            for tokens in self._tokenizer.texts_to_sequences((
                " ".join(tokens)
                for tokens in tqdm(
                    self.tokenize(texts),
                    desc="Transform texts",
                    total=len(texts),
                    disable=not self._verbose
                )
                if len(tokens) >= self._min_sequence_length
            ))
        ], dtype=object)

Transform given text.

Parameters
  • texts (List[str],): The texts to encode as digits.
Raises
  • ValueError,: If there are NaN values within given texts.
  • ValueError,: If there are non string values within given texts.
Returns
  • Numpy array with numpy arrays of tokens.
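
A minimal end-to-end sketch of the fit/transform/reverse_transform workflow on a toy corpus, again assuming nltk and its resources are available:

texts = [
    "Node embedding models learn vector representations of nodes.",
    "Edge embeddings are derived from pairs of node embeddings.",
    "Random walks provide the training sequences."
]

# Multiprocessing disabled to keep the example lightweight.
corpus_transformer = CorpusTransformer(use_multiprocessing=False, verbose=False)
corpus_transformer.fit(texts)

sequences = corpus_transformer.transform(texts)
print(corpus_transformer.vocabulary_size)
print(corpus_transformer.reverse_transform(sequences))
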
#   class LinkPredictionTransformer:
View Source
class LinkPredictionTransformer:
    """LinkPredictionTransformer class to convert graphs to edge embeddings."""

    def __init__(
        self,
        method: str = "Hadamard",
        aligned_node_mapping: bool = False,
        support_mirrored_strategy: bool = False,
    ):
        """Create new LinkPredictionTransformer object.

        Parameters
        ------------------------
        method: str = "hadamard",
            Method to use for the embedding.
            Can either be 'Hadamard', 'Sum', 'Average', 'L1', 'AbsoluteL1', 'L2' or 'Concatenate'.
        aligned_node_mapping: bool = False,
            This parameter specifies whether the mapping of the embeddings nodes
            matches the internal node mapping of the given graph.
            If these two mappings do not match, the generated edge embedding
            will be meaningless.
        support_mirrored_strategy: bool = False,
            Whether to patch support for mirrored strategy.
            At the time of writing, TensorFlow's MirroredStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        """
        self._transformer = GraphTransformer(
            method=method,
            aligned_node_mapping=aligned_node_mapping,
            support_mirrored_strategy=support_mirrored_strategy
        )

    def fit(self, embedding: pd.DataFrame):
        """Fit the model.

        Parameters
        -------------------------
        embedding: pd.DataFrame,
            Embedding to use to fit the transformer.
            This is a pandas DataFrame and NOT a numpy array because we need
            to be able to remap correctly the vector embeddings in case of
            graphs that do not respect the same internal node mapping but have
            the same node set. It is possible to remap such graphs using
            Ensmallen's remap method but it may be less intuitive to users.
        """
        self._transformer.fit(embedding)

    def transform(
        self,
        positive_graph: Union[Graph, np.ndarray, List[List[str]], List[List[int]]],
        negative_graph: Union[Graph, np.ndarray, List[List[str]], List[List[int]]],
        random_state: int = 42
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Return edge embedding for given graph using provided method.

        Parameters
        --------------------------
        positive_graph: Union[Graph, List[List[str]], List[List[int]]],
            The graph whose edges are to be embedded and labeled as positives.
            It can either be a Graph or a list of lists of edges.
        negative_graph: Union[Graph, List[List[str]], List[List[int]]],
            The graph whose edges are to be embedded and labeled as negatives.
            It can either be a Graph or a list of lists of edges.
        random_state: int = 42,
            The random state to use to shuffle the labels.

        Raises
        --------------------------
        ValueError,
            If embedding is not fitted.

        Returns
        --------------------------
        Tuple with X and y values.
        """
        positive_edge_embedding = self._transformer.transform(positive_graph)
        negative_edge_embedding = self._transformer.transform(negative_graph)
        edge_embeddings = np.vstack([
            positive_edge_embedding,
            negative_edge_embedding
        ])
        edge_labels = np.concatenate([
            np.ones(positive_edge_embedding.shape[0]),
            np.zeros(negative_edge_embedding.shape[0])
        ])
        numpy_random_state = np.random.RandomState(  # pylint: disable=no-member
            seed=random_state
        )
        indices = numpy_random_state.permutation(edge_labels.size)

        return edge_embeddings[indices], edge_labels[indices]

LinkPredictionTransformer class to convert graphs to edge embeddings.

#   LinkPredictionTransformer( method: str = 'Hadamard', aligned_node_mapping: bool = False, support_mirrored_strategy: bool = False )
View Source
    def __init__(
        self,
        method: str = "Hadamard",
        aligned_node_mapping: bool = False,
        support_mirrored_strategy: bool = False,
    ):
        """Create new LinkPredictionTransformer object.

        Parameters
        ------------------------
        method: str = "hadamard",
            Method to use for the embedding.
            Can either be 'Hadamard', 'Sum', 'Average', 'L1', 'AbsoluteL1', 'L2' or 'Concatenate'.
        aligned_node_mapping: bool = False,
            This parameter specifies whether the mapping of the embeddings nodes
            matches the internal node mapping of the given graph.
            If these two mappings do not match, the generated edge embedding
            will be meaningless.
        support_mirrored_strategy: bool = False,
            Whether to patch support for MirroredStrategy.
            At the time of writing, TensorFlow's MirroredStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        """
        self._transformer = GraphTransformer(
            method=method,
            aligned_node_mapping=aligned_node_mapping,
            support_mirrored_strategy=support_mirrored_strategy
        )

Create new LinkPredictionTransformer object.

Parameters
  • method (str = "hadamard",): Method to use for the embedding. Can either be 'Hadamard', 'Sum', 'Average', 'L1', 'AbsoluteL1', 'L2' or 'Concatenate'.
  • aligned_node_mapping (bool = False,): This parameter specifies whether the mapping of the embeddings nodes matches the internal node mapping of the given graph. If these two mappings do not match, the generated edge embedding will be meaningless.
  • support_mirrored_strategy (bool = False,): Whether to patch support for MirroredStrategy. At the time of writing, TensorFlow's MirroredStrategy does not support input values different from floats, therefore to support it we need to convert the unsigned int 32 values that represent the indices of the embedding layers we receive from Ensmallen to floats. This will generally slow down performance, but in the context of exploiting multiple GPUs it may be unnoticeable.
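
A minimal, illustrative sketch of constructing the transformer; the method names are the ones listed above, everything else is an assumption:

    from grape.embiggen import LinkPredictionTransformer

    # "Hadamard" is the default; "Concatenate", "Average", "L1", etc. are alternatives.
    transformer = LinkPredictionTransformer(method="Hadamard")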
#   def fit(self, embedding: pandas.core.frame.DataFrame):
View Source
    def fit(self, embedding: pd.DataFrame):
        """Fit the model.

        Parameters
        -------------------------
        embedding: pd.DataFrame,
            Embedding to use to fit the transformer.
            This is a pandas DataFrame and NOT a numpy array because we need
            to be able to remap correctly the vector embeddings in case of
            graphs that do not respect the same internal node mapping but have
            the same node set. It is possible to remap such graphs using
            Ensmallen's remap method but it may be less intuitive to users.
        """
        self._transformer.fit(embedding)

Fit the model.

Parameters
  • embedding (pd.DataFrame,): Embedding to use to fit the transformer. This is a pandas DataFrame and NOT a numpy array because we need to be able to remap correctly the vector embeddings in case of graphs that do not respect the same internal node mapping but have the same node set. It is possible to remap such graphs using Ensmallen's remap method but it may be less intuitive to users.
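
A hedged sketch of fitting the transformer; node_embedding is a hypothetical DataFrame whose index contains the graph's node names (for instance the output of one of the embedder models):

    import numpy as np
    import pandas as pd
    from grape.embiggen import LinkPredictionTransformer

    # Hypothetical node embedding: one row per node name, one column per dimension.
    node_embedding = pd.DataFrame(
        np.random.rand(4, 8),
        index=["node_a", "node_b", "node_c", "node_d"]
    )

    transformer = LinkPredictionTransformer(method="Hadamard")
    transformer.fit(node_embedding)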
#   def transform( self, positive_graph: Union[Graph, numpy.ndarray, List[List[str]], List[List[int]]], negative_graph: Union[Graph, numpy.ndarray, List[List[str]], List[List[int]]], random_state: int = 42 ) -> Tuple[numpy.ndarray, numpy.ndarray]:
View Source
    def transform(
        self,
        positive_graph: Union[Graph, np.ndarray, List[List[str]], List[List[int]]],
        negative_graph: Union[Graph, np.ndarray, List[List[str]], List[List[int]]],
        random_state: int = 42
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Return edge embedding for given graph using provided method.

        Parameters
        --------------------------
        positive_graph: Union[Graph, np.ndarray, List[List[str]], List[List[int]]],
            The graph whose edges are to be embedded and labeled as positives.
            It can either be a Graph or a list of lists of edges.
        negative_graph: Union[Graph, np.ndarray, List[List[str]], List[List[int]]],
            The graph whose edges are to be embedded and labeled as negatives.
            It can either be a Graph or a list of lists of edges.
        random_state: int = 42,
            The random state to use to shuffle the labels.

        Raises
        --------------------------
        ValueError,
            If embedding is not fitted.

        Returns
        --------------------------
        Tuple with X and y values.
        """
        positive_edge_embedding = self._transformer.transform(positive_graph)
        negative_edge_embedding = self._transformer.transform(negative_graph)
        edge_embeddings = np.vstack([
            positive_edge_embedding,
            negative_edge_embedding
        ])
        edge_labels = np.concatenate([
            np.ones(positive_edge_embedding.shape[0]),
            np.zeros(negative_edge_embedding.shape[0])
        ])
        numpy_random_state = np.random.RandomState(  # pylint: disable=no-member
            seed=random_state
        )
        indices = numpy_random_state.permutation(edge_labels.size)

        return edge_embeddings[indices], edge_labels[indices]

Return edge embedding for given graph using provided method.

Parameters
  • positive_graph (Union[Graph, numpy.ndarray, List[List[str]], List[List[int]]],): The graph whose edges are to be embedded and labeled as positives. It can either be a Graph or a list of lists of edges.
  • negative_graph (Union[Graph, numpy.ndarray, List[List[str]], List[List[int]]],): The graph whose edges are to be embedded and labeled as negatives. It can either be a Graph or a list of lists of edges.
  • random_state (int = 42,): The random state to use to shuffle the labels.
Raises
  • ValueError,: If embedding is not fitted.
Returns
  • Tuple with X and y values.
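
A hedged end-to-end sketch: the edges are expressed as lists of node-name pairs (one of the accepted input types), the embedding is a hypothetical toy DataFrame, and the resulting X and y can feed any downstream classifier:

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from grape.embiggen import LinkPredictionTransformer

    # Hypothetical node embedding indexed by node name.
    node_embedding = pd.DataFrame(
        np.random.rand(4, 8),
        index=["a", "b", "c", "d"]
    )
    # Positive (existing) and negative (sampled) edges as node-name pairs.
    positive_edges = [["a", "b"], ["b", "c"], ["c", "d"]]
    negative_edges = [["a", "c"], ["b", "d"]]

    transformer = LinkPredictionTransformer(method="Hadamard")
    transformer.fit(node_embedding)
    X, y = transformer.transform(positive_edges, negative_edges, random_state=42)

    # The shuffled edge embeddings and binary labels fit any scikit-learn classifier.
    RandomForestClassifier().fit(X, y)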
#   class GraphVisualization:
View Source
class GraphVisualization:
    """Tools to visualize the graph embeddings."""

    DEFAULT_SCATTER_KWARGS = dict(
        s=5,
        alpha=0.7
    )
    DEFAULT_SUBPLOT_KWARGS = dict(
        figsize=(7, 7),
        dpi=200
    )

    def __init__(
        self,
        graph: Graph,
        decomposition_method: str = "TSNE",
        scaler_method: "Scaler" = RobustScaler,
        n_components: int = 2,
        node_embedding_method_name: str = None,
        edge_embedding_method: str = "Hadamard",
        subsample_points: int = 20_000,
        random_state: int = 42,
        decomposition_kwargs: Dict = None
    ):
        """Create new GraphVisualization object.

        Parameters
        --------------------------
        graph: Graph,
            The graph to visualize.
        decomposition_method: str = "TSNE",
            The decomposition method to use.
            The supported methods are TSNE and PCA.
        scaler_method: "Scaler" = RobustScaler,
            The scaler object to use to normalize the embedding.
            By default we use a Robust Scaler.
            Pass None to not use any scaler.
        n_components: int = 2,
            Number of components to reduce the embedding to.
            Currently, both 2D and 3D decompositions are supported.
        node_embedding_method_name: str = None,
            Name of the node embedding method used.
            If provided, it is added to the images titles.
        edge_embedding_method: str = "Hadamard",
            Edge embedding method.
            Can either be 'Hadamard', 'Sum', 'Average', 'L1', 'AbsoluteL1', 'L2' or 'Concatenate'.
        subsample_points: int = 20_000,
            Number of points to subsample.
            Some graphs have a number of nodes and edges in the millions.
            Using non-CUDA versions of TSNE, the dimensionality reduction
            procedure can take a considerable amount of time.
            For this purpose, we include the possibility to subsample the
            points to the given number.
            The subsampling is done in a way that takes into consideration
            the node types and/or edge types (the subsampling is applied
            separately to the two different sets) by using a Stratified Shuffle
            Split if there are node types or edge types.
            Otherwise, a normal train test split is used.
            If None is given, no subsampling is executed.
        random_state: int = 42,
            The random state to reproduce the visualizations.
        decomposition_kwargs: Dict = None,
            Kwargs to forward to the selected decomposition method.

        Raises
        ---------------------------
        ValueError,
            If the target decomposition size is not supported.
        ModuleNotFoundError,
            If TSNE decomposition has been required and no module supporting
            it is installed.
        """
        self._graph = graph
        self._graph_transformer = GraphTransformer(
            method=edge_embedding_method
        )
        self._node_transformer = NodeTransformer()
        self._node_embedding_method_name = node_embedding_method_name
        self._node_mapping = self._node_embedding = self._edge_embedding = None
        self._subsampled_node_ids = None
        self._subsampled_edge_ids = None
        self._subsample_points = subsample_points
        self._random_state = random_state

        if decomposition_kwargs is None:
            decomposition_kwargs = {}

        if n_components not in {2, 3}:
            raise ValueError(
                "We currently only support 2D and 3D decomposition visualization."
            )

        self._n_components = n_components
        self._scaler_method = None if scaler_method is None else scaler_method()

        if decomposition_method == "TSNE":
            try:
                # We try to use CUDA tsne if available, but this does not
                # currently support 3D decomposition. If the user has required a
                # 3D decomposition we need to switch to the MulticoreTSNE version.
                # Additionally, in the case that the desired decomposition
                # uses some not available parameters, such as a cosine distance
                # metric, we will capture that use case as a NotImplementedError.
                if n_components != 2:
                    raise NotImplementedError()
                from tsnecuda import TSNE as CUDATSNE  # pylint: disable=import-error,import-outside-toplevel
                self._decomposition_method = CUDATSNE(
                    n_components=2,
                    random_seed=random_state,
                    verbose=True,
                    **decomposition_kwargs
                )
            except (ModuleNotFoundError, NotImplementedError):
                try:
                    from MulticoreTSNE import \
                        MulticoreTSNE  # pylint: disable=import-outside-toplevel
                    self._decomposition_method = MulticoreTSNE(
                        n_components=n_components,
                        n_jobs=cpu_count(),
                        random_state=random_state,
                        verbose=True,
                        **decomposition_kwargs
                    )
                except ModuleNotFoundError:
                    try:
                        from sklearn.manifold import \
                            TSNE  # pylint: disable=import-outside-toplevel
                        self._decomposition_method = TSNE(
                            n_components=n_components,
                            n_jobs=cpu_count(),
                            random_state=random_state,
                            verbose=True,
                            **decomposition_kwargs
                        )
                    except Exception:
                        raise ModuleNotFoundError(
                            "You do not have a supported TSNE "
                            "decomposition algorithm installed. If your graph "
                            "is very big (in the millions of nodes) and you have "
                            "access to a compatible GPU system, we suggest you "
                            "install tsne-cuda.\n"
                            "Alternatively, we suggest (and support) MulticoreTSNE, "
                            "which tends to be easier to install, and is significantly "
                            "faster than the Sklearn implementation.\n"
                            "If you intend to do 3D decompositions, "
                            "remember that tsne-cuda, at the time of writing, "
                            "does not support them."
                        )
        elif decomposition_method == "PCA":
            self._decomposition_method = PCA(
                n_components=n_components,
                random_state=random_state,
                **decomposition_kwargs
            )
        else:
            raise ValueError(
                "We currently only support PCA and TSNE decomposition methods."
            )

    def decompose(self, X: np.ndarray) -> np.ndarray:
        """Return requested decomposition of given array.

        Parameters
        -----------------------
        X: np.ndarray,
            The data to embed.

        Raises
        -----------------------
        ValueError,
            If the given vector has less components than the required
            decomposition target.

        Returns
        -----------------------
        The obtained decomposition.
        """
        if X.shape[1] == self._n_components:
            return X
        if X.shape[1] < self._n_components:
            raise ValueError(
                "The vector to decompose has less components than "
                "the decomposition target."
            )
        return self._decomposition_method.fit_transform(X)

    def _shuffle(
        self,
        *args: List[Union[np.ndarray, pd.DataFrame, None]]
    ) -> List[np.ndarray]:
        """Return given arrays shuffled synchronously.

        The points are shuffled mainly to avoid 'fake' clusters appearing
        artificially simply because the points are stacked by class
        according to how they are sorted.

        Parameters
        ------------------------
        *args: List[Union[np.ndarray, pd.DataFrame, None]],
            The lists to shuffle.

        Returns
        ------------------------
        Shuffled data using given random state.
        """
        index = np.arange(args[0].shape[0])
        random_state = np.random.RandomState(  # pylint: disable=no-member
            seed=self._random_state
        )
        random_state.shuffle(index)
        return [
            arg[index] if isinstance(arg, np.ndarray)
            else arg.iloc[index] if isinstance(arg, pd.DataFrame)
            else None
            for arg in args
        ]

    def _set_legend(
        self,
        axes: Axes,
        labels: List[str],
        handles: List[HandlerBase],
        legend_title: str
    ):
        """Set the legend with the given values and handles transparency.

        Parameters
        ----------------------------
        axes: Axes,
            The axes on which to put the legend.
        labels: List[str],
            Labels to put in the legend.
        handles: List[HandlerBase],
            Handles to display in the legend (the corresponding matplotlib
            objects).
        legend_title: str,
            Title for the legend.
        """
        legend = axes.legend(
            handles=handles,
            labels=sanitize_ml_labels(labels),
            loc='best',
            title=legend_title,
            **(
                dict(handler_map={tuple: HandlerTuple(ndivide=None)})
                if isinstance(handles[0], tuple)
                else {}
            )
        )
        # Setting alpha level in the legend to avoid having a transparent
        # legend scatter dots.
        for legend_handle in legend.legendHandles:
            legend_handle._legmarker.set_alpha(  # pylint: disable=protected-access
                1
            )

    def fit_transform_nodes(
        self,
        node_embedding: pd.DataFrame
    ):
        """Executes fitting for plotting node embeddings.

        Parameters
        -------------------------
        node_embedding: pd.DataFrame,
            Embedding of the graph nodes.
        """
        # Retrieve the nodes
        node_names = np.array(self._graph.get_node_names())
        # If necessary, we proceed with the subsampling
        if self._subsample_points is not None and self._graph.get_number_of_nodes() > self._subsample_points:
            # If there are node types, we use a stratified
            # node sampling so that all the nodes types may be displayed.
            if self._graph.has_node_types() and not self._graph.has_singleton_node_types():
                Splitter = StratifiedShuffleSplit
            else:
                # Otherwise there is no need to stratify.
                Splitter = ShuffleSplit
            # We compute the indices
            self._subsampled_node_ids, _ = next(Splitter(
                n_splits=1,
                train_size=self._subsample_points,
                random_state=self._random_state
            ).split(node_names, self._flatten_multi_label_and_unknown_node_types()))
            # And sample the nodes
            node_names = node_names[self._subsampled_node_ids]

        if self._scaler_method is not None:
            node_embedding = pd.DataFrame(
                self._scaler_method.fit_transform(node_embedding),
                columns=node_embedding.columns,
                index=node_embedding.index,
            )
        self._node_transformer.fit(node_embedding)
        self._node_embedding = pd.DataFrame(
            self.decompose(
                self._node_transformer.transform(node_names)
            ),
            index=node_names
        )

    def fit_transform_edges(
        self,
        node_embedding: Optional[pd.DataFrame] = None,
        edge_embedding: Optional[pd.DataFrame] = None,
    ):
        """Executes fitting for plotting edge embeddings.

        Parameters
        -------------------------
        node_embedding: Optional[pd.DataFrame] = None,
            Node embedding obtained from SkipGram, CBOW, GloVe or other methods.
        edge_embedding: Optional[pd.DataFrame] = None,
            Edge embedding.

        Raises
        -------------------------
        ValueError,
            If neither the node embedding nor the edge embedding have
            been provided. You need to provide exactly one of the two.
        ValueError,
            If the shape of the given node embedding does not match
            the number of nodes in the graph.
        ValueError,
            If the shape of the given edge embedding does not match
            the number of directed edges in the graph.
        """
        if node_embedding is None and edge_embedding is None:
            raise ValueError(
                "You need to provide either the node embedding or the "
                "edge embedding."
            )
        if node_embedding is not None and edge_embedding is not None:
            raise ValueError(
                "You need to provide either the node embedding or the "
                "edge embedding. You cannot provide both at once."
            )
        if node_embedding is not None and node_embedding.shape[0] != self._graph.get_number_of_nodes():
            raise ValueError(
                ("The number of rows provided with the given node embedding {} "
                 "does not match the number of nodes in the graph {}.").format(
                    node_embedding.shape[0],
                    self._graph.get_number_of_nodes()
                )
            )
        if edge_embedding is not None and edge_embedding.shape[0] != self._graph.get_directed_edges_number():
            raise ValueError(
                ("The number of rows provided with the given edge embedding {} "
                 "does not match the number of directed edges in the graph {}.").format(
                    edge_embedding.shape[0],
                    self._graph.get_directed_edges_number()
                )
            )

        # Retrieve the edges
        edge_names = np.array(self._graph.get_edge_node_names(directed=True))
        # If necessary, we proceed with the subsampling
        if self._subsample_points is not None and len(edge_names) > self._subsample_points:
            # If there are edge types, we use a stratified
            # edge sampling so that all the edges types may be displayed.
            if self._graph.has_edge_types() and not self._graph.has_singleton_edge_types():
                Splitter = StratifiedShuffleSplit
            else:
                # Otherwise there is no need to stratify.
                Splitter = ShuffleSplit
            # We compute the indices
            self._subsampled_edge_ids, _ = next(Splitter(
                n_splits=1,
                train_size=self._subsample_points,
                random_state=self._random_state
            ).split(edge_names, self._flatten_unknown_edge_types()))
            # And sample the edges
            edge_names = edge_names[self._subsampled_edge_ids]
            if edge_embedding is not None:
                edge_embedding = edge_embedding[self._subsampled_edge_ids]

        if node_embedding is not None:
            if self._scaler_method is not None:
                node_embedding = pd.DataFrame(
                    self._scaler_method.fit_transform(node_embedding),
                    columns=node_embedding.columns,
                    index=node_embedding.index,
                )
            self._graph_transformer.fit(node_embedding)
            edge_embedding = self._graph_transformer.transform(edge_names)
        self._edge_embedding = pd.DataFrame(
            self.decompose(edge_embedding),
            index=edge_names
        )

    def _get_figure_and_axes(
        self,
        **kwargs: Dict
    ) -> Tuple[Figure, Axes]:
        """Return tuple with figure and axes built using provided kwargs and defaults."""
        if self._n_components == 2:
            figure, axes = plt.subplots(**{
                **GraphVisualization.DEFAULT_SUBPLOT_KWARGS,
                **kwargs
            })
        else:
            figure, axes = subplots_3d(**{
                **GraphVisualization.DEFAULT_SUBPLOT_KWARGS,
                **kwargs
            })
        return figure, axes

    def _plot_scatter(
        self,
        title: str,
        points: np.ndarray,
        colors: List[int] = None,
        edgecolors: List[int] = None,
        labels: List[str] = None,
        legend_title: str = "",
        show_title: bool = True,
        show_legend: bool = True,
        figure: Figure = None,
        axes: Axes = None,
        scatter_kwargs: Dict = None,
        train_indices: np.ndarray = None,
        test_indices: np.ndarray = None,
        train_marker: str = "o",
        test_marker: str = "X",
        **kwargs
    ) -> Tuple[Figure, Axes, Tuple[Collection]]:
        """Plot nodes of provided graph.

        Parameters
        ------------------------------
        title: str,
            Title to use for the plot.
        points: np.ndarray,
            Points to plot.
        colors: List[int] = None,
            List of the colors to use for the scatter plot.
        edgecolors: List[int] = None,
            List of the edge colors to use for the scatter plot.
        labels: List[str] = None,
            Labels for the different colors.
        legend_title: str = "",
            Title for the legend.
        show_title: bool = True,
            Whether to show the figure title.
        show_legend: bool = True,
            Whether to show the legend.
        figure: Figure = None,
            Figure to use to plot. If None, a new one is created using the
            provided kwargs.
        axes: Axes = None,
            Axes to use to plot. If None, a new one is created using the
            provided kwargs.
        scatter_kwargs: Dict = None,
            Kwargs to pass to the scatter plot call.
        train_indices: np.ndarray = None,
            Indices to draw using the training marker.
            If None, all points are drawn using the training marker.
        test_indices: np.ndarray = None,
            Indices to draw using the test marker.
            If None, while providing the train indices, we only plot the
            training points.
        train_marker: str = "o",
            The marker to use to draw the training points.
        test_marker: str = "X",
            The marker to use to draw the test points.
        **kwargs: Dict,
            Arguments to pass to the subplots.

        Raises
        ------------------------------
        ValueError,
            If given train and test indices overlap.

        Returns
        ------------------------------
        Figure and Axis of the plot, followed by tuple of collections.
        """
        if train_indices is not None and test_indices is not None:
            if np.isin(train_indices, test_indices).any():
                raise ValueError(
                    "The train and test indices overlap."
                )

        if figure is None or axes is None:
            figure, axes = self._get_figure_and_axes(**kwargs)

        scatter_kwargs = {
            **GraphVisualization.DEFAULT_SCATTER_KWARGS,
            **(
                dict(linewidths=0)
                if edgecolors is None
                else dict(linewidths=0.5)
            ),
            **({} if scatter_kwargs is None else scatter_kwargs),
        }

        train_test_mask = np.zeros((points.shape[0]))
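        # The mask marks each point as generic (0), training (1) or test (2),
        # so that the two splits can be drawn with the train and test markers.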

        if train_indices is not None:
            train_test_mask[train_indices] = 1

        if test_indices is not None:
            train_test_mask[test_indices] = 2

        points, colors, edgecolors, train_test_mask = self._shuffle(
            points,
            colors,
            edgecolors,
            train_test_mask
        )

        legend_elements = []
        collections = []

        color_names = np.array([
            "tab:blue",
            "tab:orange",
            "tab:green",
            "tab:red",
            "tab:purple",
            "tab:brown",
            "tab:pink",
            "tab:gray",
            "tab:olive",
            "tab:cyan",
        ])

        if colors is not None:
            cmap = scatter_kwargs.pop(
                "cmap",
                ListedColormap(color_names[:int(colors.max() + 1)])
            )
        else:
            cmap = None

        if train_indices is None and test_indices is None:
            scatter = axes.scatter(
                *points.T,
                c=colors,
                edgecolors=None if edgecolors is None else cmap(edgecolors),
                marker=train_marker,
                cmap=cmap,
                **scatter_kwargs
            )
            collections.append(scatter)
            legend_elements += scatter.legend_elements()[0]

        if train_indices is not None:
            train_mask = train_test_mask == 1
            train_scatter = axes.scatter(
                *points[train_mask].T,
                c=colors[train_mask],
                edgecolors=None if edgecolors is None else cmap(
                    edgecolors[train_mask]
                ),
                marker=train_marker,
                cmap=cmap,
                **scatter_kwargs
            )
            collections.append(train_scatter)
            legend_elements.append(train_scatter.legend_elements()[0])

        if test_indices is not None:
            test_mask = train_test_mask == 2
            test_scatter = axes.scatter(
                *points[test_mask].T,
                c=colors[test_mask],
                edgecolors=None if edgecolors is None else cmap(
                    edgecolors[test_mask]),
                marker=test_marker,
                cmap=cmap,
                **scatter_kwargs
            )
            collections.append(test_scatter)
            legend_elements.append(test_scatter.legend_elements()[0])

        rectangle_to_fill_legend = matplotlib.patches.Rectangle(
            (0, 0), 1, 1,
            fill=False,
            edgecolor='none',
            visible=False
        )
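        # The invisible rectangle is used as a filler legend handle: when both
        # train and test indices are provided, the per-class handles of the two
        # scatter plots are merged below, and classes missing from one of the
        # two splits receive the filler instead.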

        if all(
            e is not None
            for e in (colors, train_indices, test_indices, labels)
        ):
            unique_train_colors = np.unique(colors[train_mask])
            unique_test_colors = np.unique(colors[test_mask])
            new_legend_elements = []
            train_element_index = 0
            test_element_index = 0
            for color in np.unique(colors):
                new_tuple = []
                if color in unique_train_colors:
                    new_tuple.append(legend_elements[0][train_element_index])
                    train_element_index += 1
                else:
                    new_tuple.append(rectangle_to_fill_legend)
                if color in unique_test_colors:
                    new_tuple.append(legend_elements[1][test_element_index])
                    test_element_index += 1
                else:
                    new_tuple.append(rectangle_to_fill_legend)

                new_legend_elements.append(tuple(new_tuple))
            legend_elements = new_legend_elements

        if show_legend and labels is not None:
            self._set_legend(
                axes,
                labels,
                legend_elements,
                legend_title
            )

        if self._n_components == 2:
            axes.set_axis_off()

        title = "{} - {}".format(
            title,
            self._graph.get_name(),
        )

        if self._node_embedding_method_name is not None:
            title = "{} - {}".format(
                title,
                self._node_embedding_method_name
            )

        if show_title:
            axes.set_title(title)
        figure.tight_layout()

        return figure, axes, collections

    def _plot_types(
        self,
        title: str,
        points: np.ndarray,
        types: List[int],
        type_labels: List[str],
        legend_title: str,
        show_title: bool = True,
        show_legend: bool = True,
        predictions: List[int] = None,
        k: int = 9,
        figure: Figure = None,
        axes: Axes = None,
        scatter_kwargs: Dict = None,
        other_label: str = "Other",
        train_indices: np.ndarray = None,
        test_indices: np.ndarray = None,
        train_marker: str = "o",
        test_marker: str = "X",
        **kwargs
    ) -> Tuple[Figure, Axes]:
        """Plot common node types of provided graph.

        Parameters
        ------------------------------
        title: str,
            Title to use for the plot.
        points: np.ndarray,
            Points to plot.
        types: List[int],
            Types of the provided points.
        type_labels: List[str],
            List of the labels for the provided types.
        legend_title: str,
            Title for the legend.
        show_title: bool = True,
            Whether to show the figure title.
        show_legend: bool = True,
            Whether to show the legend.
        predictions: List[int] = None,
            List of the labels predicted.
            If None, no prediction is visualized.
        k: int = 9,
            Number of types to visualize.
        figure: Figure = None,
            Figure to use to plot. If None, a new one is created using the
            provided kwargs.
        axes: Axes = None,
            Axes to use to plot. If None, a new one is created using the
            provided kwargs.
        scatter_kwargs: Dict = None,
            Kwargs to pass to the scatter plot call.
        other_label: str = "Other",
            Label to use for the types below the top k threshold.
        train_indices: np.ndarray = None,
            Indices to draw using the training marker.
            If None, all points are drawn using the training marker.
        test_indices: np.ndarray = None,
            Indices to draw using the test marker.
            If None, while providing the train indices, we only plot the
            training points.
        train_marker: str = "o",
            The marker to use to draw the training points.
        test_marker: str = "X",
            The marker to use to draw the test points.
        **kwargs: Dict,
            Arguments to pass to the subplots.

        Raises
        ------------------------------
        ValueError,
            If edge fitting was not yet executed.
        ValueError,
            If given k is greater than the maximum supported value (9).
        ValueError,
            If the number of given type labels does not match the number
            of given type counts.

        Returns
        ------------------------------
        Figure and Axis of the plot.
        """
        if k > 9:
            raise ValueError(
                "Values of k greater than 9 are not supported!"
            )

        # Ensure the provided types are a numpy array.
        types = np.array(types)

        number_of_types = np.unique(types).size
        type_labels = np.array(type_labels)

        counts = np.bincount(types, minlength=number_of_types)
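        # Keep the indices of the k most common types; every type outside the
        # top k is remapped below to an extra bucket labelled with other_label.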
        top_counts = [
            index
            for index, _ in sorted(
                enumerate(zip(counts, type_labels)),
                key=lambda x: x[1],
                reverse=True
            )[:k]
        ]

        type_labels = list(type_labels[top_counts])

        for i, element_type in enumerate(types):
            if element_type not in top_counts:
                types[i] = k
            else:
                types[i] = top_counts.index(element_type)

        if predictions is not None:
            predictions = predictions.copy()
            for i, element_type in enumerate(predictions):
                if element_type not in top_counts:
                    predictions[i] = k
                else:
                    predictions[i] = top_counts.index(element_type)

        if k < number_of_types:
            type_labels.append(other_label)

        figure, axis, _ = self._plot_scatter(
            title=title,
            points=points,
            colors=types,
            edgecolors=predictions,
            labels=type_labels,
            legend_title=legend_title,
            show_title=show_title,
            show_legend=show_legend,
            figure=figure,
            axes=axes,
            scatter_kwargs=scatter_kwargs,
            train_indices=train_indices,
            test_indices=test_indices,
            train_marker=train_marker,
            test_marker=test_marker,
            **kwargs
        )

        return figure, axis

    def plot_edge_segments(
        self,
        points: np.ndarray,
        figure: Figure = None,
        axes: Axes = None,
        **kwargs: Dict
    ) -> Tuple[Figure, Axes]:
        if figure is None or axes is None:
            figure, axes = self._get_figure_and_axes(**kwargs)

        if self._subsampled_node_ids is not None:
            edge_node_ids = self._graph.get_edge_ids_from_node_ids(
                node_ids=self._subsampled_node_ids,
                add_selfloops_where_missing=False,
                complete=False,
            )
        else:
            edge_node_ids = self._graph.get_edge_node_ids(
                directed=False
            )

        lines_collection = mc.LineCollection(
            points[edge_node_ids],
            linewidths=1,
            zorder=0
        )
        axes.add_collection(lines_collection)

        return figure, axes

    def plot_nodes(
        self,
        figure: Figure = None,
        axes: Axes = None,
        scatter_kwargs: Dict = None,
        train_indices: np.ndarray = None,
        test_indices: np.ndarray = None,
        train_marker: str = "o",
        test_marker: str = "X",
        show_title: bool = True,
        show_legend: bool = True,
        annotate_nodes: Union[str, bool] = "auto",
        show_edges: bool = False,
        **kwargs: Dict
    ) -> Tuple[Figure, Axes]:
        """Plot nodes of provided graph.

        Parameters
        ------------------------------
        figure: Figure = None,
            Figure to use to plot. If None, a new one is created using the
            provided kwargs.
        axes: Axes = None,
            Axes to use to plot. If None, a new one is created using the
            provided kwargs.
        scatter_kwargs: Dict = None,
            Kwargs to pass to the scatter plot call.
        train_indices: np.ndarray = None,
            Indices to draw using the training marker.
            If None, all points are drawn using the training marker.
        test_indices: np.ndarray = None,
            Indices to draw using the test marker.
            If None, while providing the train indices, we only plot the
            training points.
        train_marker: str = "o",
            The marker to use to draw the training points.
        test_marker: str = "X",
            The marker to use to draw the test points.
        show_title: bool = True,
            Whether to show the figure title.
        show_legend: bool = True,
            Whether to show the legend.
        annotate_nodes: Union[str, bool] = "auto",
            Whether to show the node name when scattering them.
            The default behaviour, "auto", means that it will
            enable this feature automatically when the graph has
            less than 100 nodes.
        **kwargs: Dict,
            Arguments to pass to the subplots.

        Raises
        ------------------------------
        ValueError,
            If node fitting was not yet executed.

        Returns
        ------------------------------
        Figure and Axis of the plot.
        """
        if self._node_embedding is None:
            raise ValueError(
                "Node fitting must be executed before plot."
            )

        if annotate_nodes == "auto":
            annotate_nodes = self._graph.get_number_of_nodes() < 100

        if show_edges:
            figure, axes = self.plot_edge_segments(
                self._node_embedding.values,
                figure,
                axes,
                **kwargs
            )

        figure, axes, _ = self._plot_scatter(
            "Nodes embedding",
            self._node_embedding.values,
            figure=figure,
            axes=axes,
            scatter_kwargs=scatter_kwargs,
            train_indices=train_indices,
            test_indices=test_indices,
            train_marker=train_marker,
            test_marker=test_marker,
            show_title=show_title,
            show_legend=show_legend,
            **kwargs
        )

        if annotate_nodes:
            figure, axes = self.annotate_nodes(
                figure=figure,
                axes=axes,
                points=self._node_embedding.values,
            )

        return figure, axes

    def annotate_nodes(
        self,
        figure: Figure,
        axes: Axes,
        points: np.ndarray
    ) -> Tuple[Figure, Axes]:
        if self._subsampled_node_ids is not None:
            node_names = [
                self._graph.get_node_name_from_node_id(node_id)
                for node_id in self._subsampled_node_ids
            ]
        else:
            node_names = self._graph.get_node_names()
        for i, txt in enumerate(node_names):
            axes.annotate(txt, points[i])
        return (figure, axes)

    def plot_edges(
        self,
        figure: Figure = None,
        axes: Axes = None,
        scatter_kwargs: Dict = None,
        train_indices: np.ndarray = None,
        test_indices: np.ndarray = None,
        train_marker: str = "o",
        test_marker: str = "X",
        show_title: bool = True,
        show_legend: bool = True,
        **kwargs: Dict
    ) -> Tuple[Figure, Axes]:
        """Plot edge embedding of provided graph.

        Parameters
        ------------------------------
        figure: Figure = None,
            Figure to use to plot. If None, a new one is created using the
            provided kwargs.
        axes: Axes = None,
            Axes to use to plot. If None, a new one is created using the
            provided kwargs.
        scatter_kwargs: Dict = None,
            Kwargs to pass to the scatter plot call.
        train_indices: np.ndarray = None,
            Indices to draw using the training marker.
            If None, all points are drawn using the training marker.
        test_indices: np.ndarray = None,
            Indices to draw using the test marker.
            If None, while providing the train indices, we only plot the
            training points.
        train_marker: str = "o",
            The marker to use to draw the training points.
        test_marker: str = "X",
            The marker to use to draw the test points.
        show_title: bool = True,
            Whether to show the figure title.
        show_legend: bool = True,
            Whether to show the legend.
        **kwargs: Dict,
            Arguments to pass to the subplots.

        Raises
        ------------------------------
        ValueError,
            If edge fitting was not yet executed.

        Returns
        ------------------------------
        Figure and Axis of the plot.
        """
        if self._edge_embedding is None:
            raise ValueError(
                "Edge fitting must be executed before plot."
            )

        figure, axis, _ = self._plot_scatter(
            "Edges embedding",
            self._edge_embedding.values,
            figure=figure,
            axes=axes,
            scatter_kwargs=scatter_kwargs,
            train_indices=train_indices,
            test_indices=test_indices,
            train_marker=train_marker,
            test_marker=test_marker,
            show_title=show_title,
            show_legend=show_legend,
            **kwargs
        )

        return figure, axis

    def _flatten_multi_label_and_unknown_node_types(self) -> np.ndarray:
        # The following is needed to normalize the multiple types
        node_types_counts = self._graph.get_node_type_id_counts_hashmap()
        node_types_number = self._graph.get_node_types_number()
        # When we have multiple node types for a given node, we set it to
        # the most common node type of the set.
        return np.array([
            sorted(
                node_type_ids,
                key=lambda node_type: node_types_counts[node_type],
                reverse=True
            )[0]
            if node_type_ids is not None
            else
            node_types_number
            for node_type_ids in self._graph.get_node_type_ids()
        ])

    def _flatten_unknown_edge_types(self) -> np.ndarray:
        # The following is needed to normalize the unknown edge types.
        edge_types_number = self._graph.get_edge_types_number()
        # When an edge has an unknown edge type, we map it to an extra type
        # index equal to the number of edge types.
        return np.array([
            edge_type_id
            if edge_type_id is not None
            else
            edge_types_number
            for edge_type_id in self._graph.get_edge_type_ids()
        ])

    def plot_node_types(
        self,
        node_type_predictions: List[int] = None,
        k: int = 9,
        figure: Figure = None,
        axes: Axes = None,
        scatter_kwargs: Dict = None,
        legend_title: str = "Node types",
        other_label: str = "Other",
        train_indices: np.ndarray = None,
        test_indices: np.ndarray = None,
        train_marker: str = "o",
        test_marker: str = "X",
        show_title: bool = True,
        show_legend: bool = True,
        show_edges: bool = False,
        annotate_nodes: Union[str, bool] = "auto",
        **kwargs
    ) -> Tuple[Figure, Axes]:
        """Plot common node types of provided graph.

        Parameters
        ------------------------------
        node_type_predictions: List[int] = None,
            Predictions of the node types.
        k: int = 9,
            Number of node types to visualize.
        figure: Figure = None,
            Figure to use to plot. If None, a new one is created using the
            provided kwargs.
        axes: Axes = None,
            Axes to use to plot. If None, a new one is created using the
            provided kwargs.
        scatter_kwargs: Dict = None,
            Kwargs to pass to the scatter plot call.
        other_label: str = "Other",
            Label to use for node types below the top k threshold.
        train_indices: np.ndarray = None,
            Indices to draw using the training marker.
            If None, all points are drawn using the training marker.
        test_indices: np.ndarray = None,
            Indices to draw using the test marker.
            If None, while providing the train indices, we only plot the
            training points.
        train_marker: str = "o",
            The marker to use to draw the training points.
        test_marker: str = "X",
            The marker to use to draw the test points.
        show_title: bool = True,
            Whether to show the figure title.
        show_legend: bool = True,
            Whether to show the legend.
        **kwargs: Dict,
            Arguments to pass to the subplots.

        Raises
        ------------------------------
        ValueError,
            If node fitting was not yet executed.
        ValueError,
            If given k is greater than the maximum supported value (9).

        Returns
        ------------------------------
        Figure and Axis of the plot.
        """
        if not self._graph.has_node_types():
            raise ValueError(
                "The graph does not have node types!"
            )

        if self._node_embedding is None:
            raise ValueError(
                "Node fitting must be executed before plot."
            )

        if show_edges:
            figure, axes = self.plot_edge_segments(
                self._node_embedding.values,
                figure,
                axes,
                **kwargs
            )

        if annotate_nodes == "auto":
            annotate_nodes = self._graph.get_number_of_nodes() < 100

        node_types = self._flatten_multi_label_and_unknown_node_types()
        if self._subsampled_node_ids is not None:
            node_types = node_types[self._subsampled_node_ids]

        node_type_names = self._graph.get_unique_node_type_names()

        if self._graph.has_unknown_node_types():
            node_type_names.append("Unknown")

        node_type_names = np.array(node_type_names)

        figure, axes = self._plot_types(
            "Node types",
            self._node_embedding.values,
            types=node_types,
            type_labels=node_type_names,
            legend_title=legend_title,
            predictions=node_type_predictions,
            k=k,
            figure=figure,
            axes=axes,
            scatter_kwargs=scatter_kwargs,
            other_label=other_label,
            train_indices=train_indices,
            test_indices=test_indices,
            train_marker=train_marker,
            test_marker=test_marker,
            show_title=show_title,
            show_legend=show_legend,
            **kwargs
        )

        if annotate_nodes:
            figure, axes = self.annotate_nodes(
                figure=figure,
                axes=axes,
                points=self._node_embedding.values,
            )

        return figure, axes

    def plot_connected_components(
        self,
        k: int = 9,
        figure: Figure = None,
        axes: Axes = None,
        scatter_kwargs: Dict = None,
        other_label: str = "Other",
        legend_title: str = "Component sizes",
        train_indices: np.ndarray = None,
        test_indices: np.ndarray = None,
        train_marker: str = "o",
        test_marker: str = "X",
        show_title: bool = True,
        show_legend: bool = True,
        show_edges: bool = False,
        annotate_nodes: Union[str, bool] = "auto",
        **kwargs
    ) -> Tuple[Figure, Axes]:
        """Plot common node types of provided graph.

        Parameters
        ------------------------------
        k: int = 9,
            Number of components to visualize.
        figure: Figure = None,
            Figure to use to plot. If None, a new one is created using the
            provided kwargs.
        axes: Axes = None,
            Axes to use to plot. If None, a new one is created using the
            provided kwargs.
        scatter_kwargs: Dict = None,
            Kwargs to pass to the scatter plot call.
        other_label: str = "Other",
            Label to use for components below the top k threshold.
        legend_title: str = "Component sizes",
            Title for the legend.
        train_indices: np.ndarray = None,
            Indices to draw using the training marker.
            If None, all points are drawn using the training marker.
        test_indices: np.ndarray = None,
            Indices to draw using the test marker.
            If None, while providing the train indices, we only plot the
            training points.
        train_marker: str = "o",
            The marker to use to draw the training points.
        test_marker: str = "X",
            The marker to use to draw the test points.
        show_title: bool = True,
            Whether to show the figure title.
        show_legend: bool = True,
            Whether to show the legend.
        **kwargs: Dict,
            Arguments to pass to the subplots.

        Raises
        ------------------------------
        ValueError,
            If node fitting was not yet executed.
        ValueError,
            If given k is greater than the maximum supported value (9).

        Returns
        ------------------------------
        Figure and Axis of the plot.
        """
        if self._node_embedding is None:
            raise ValueError(
                "Node fitting must be executed before plot."
            )

        if show_edges:
            figure, axes = self.plot_edge_segments(
                self._node_embedding.values,
                figure,
                axes,
                **kwargs
            )

        if annotate_nodes == "auto":
            annotate_nodes = self._graph.get_number_of_nodes() < 100

        components, components_number, _, _ = self._graph.connected_components()
        sizes = np.bincount(components, minlength=components_number)

        if self._subsampled_node_ids is not None:
            components = components[self._subsampled_node_ids]

        figure, axes = self._plot_types(
            "Components",
            self._node_embedding.values,
            types=components,
            type_labels=np.array([
                "Size {}".format(size)
                for size in sizes
            ]),
            legend_title=legend_title,
            show_title=show_title,
            show_legend=show_legend,
            k=k,
            figure=figure,
            axes=axes,
            scatter_kwargs=scatter_kwargs,
            other_label=other_label,
            train_indices=train_indices,
            test_indices=test_indices,
            train_marker=train_marker,
            test_marker=test_marker,
            **kwargs
        )

        if annotate_nodes:
            figure, axes = self.annotate_nodes(
                figure=figure,
                axes=axes,
                points=self._node_embedding.values,
            )

        return figure, axes

    def plot_node_degrees(
        self,
        figure: Figure = None,
        axes: Axes = None,
        scatter_kwargs: Dict = None,
        train_indices: np.ndarray = None,
        test_indices: np.ndarray = None,
        train_marker: str = "o",
        test_marker: str = "X",
        use_log_scale: bool = True,
        show_title: bool = True,
        show_legend: bool = True,
        show_edges: bool = False,
        annotate_nodes: Union[str, bool] = "auto",
        **kwargs: Dict
    ):
        """Plot node degrees heatmap.

        Parameters
        ------------------------------
        figure: Figure = None,
            Figure to use to plot. If None, a new one is created using the
            provided kwargs.
        axes: Axes = None,
            Axes to use to plot. If None, a new one is created using the
            provided kwargs.
        scatter_kwargs: Dict = None,
            Kwargs to pass to the scatter plot call.
        train_indices: np.ndarray = None,
            Indices to draw using the training marker.
            If None, all points are drawn using the training marker.
        test_indices: np.ndarray = None,
            Indices to draw using the test marker.
            If None, while providing the train indices, we only plot the
            training points.
        train_marker: str = "o",
            The marker to use to draw the training points.
        test_marker: str = "X",
            The marker to use to draw the test points.
        use_log_scale: bool = True,
            Whether to use log scale.
        show_title: bool = True,
            Whether to show the figure title.
        show_legend: bool = True,
            Whether to show the legend.
        **kwargs: Dict,
            Additional kwargs for the subplots.

        Raises
        ------------------------------
        ValueError,
            If node fitting was not yet executed.

        Returns
        ------------------------------
        Figure and Axis of the plot.
        """
        if self._node_embedding is None:
            raise ValueError(
                "Node fitting must be executed before plot."
            )

        degrees = self._graph.get_node_degrees()
        if self._subsampled_node_ids is not None:
            degrees = degrees[self._subsampled_node_ids]

        if annotate_nodes == "auto":
            annotate_nodes = self._graph.get_number_of_nodes() < 100

        if show_edges:
            figure, axes = self.plot_edge_segments(
                self._node_embedding.values,
                figure,
                axes,
                **kwargs
            )

        figure, axes, scatter = self._plot_scatter(
            "Node degrees",
            self._node_embedding.values,
            colors=degrees,
            figure=figure,
            axes=axes,
            scatter_kwargs={
                **({} if scatter_kwargs is None else scatter_kwargs),
                "cmap": plt.cm.get_cmap('RdYlBu'),
                **({"norm": LogNorm()} if use_log_scale else {})
            },
            train_indices=train_indices,
            test_indices=test_indices,
            train_marker=train_marker,
            test_marker=test_marker,
            show_title=show_title,
            show_legend=show_legend,
            **kwargs
        )

        color_bar = figure.colorbar(scatter[0], ax=axes)
        color_bar.set_alpha(1)
        color_bar.draw_all()

        if annotate_nodes:
            figure, axes = self.annotate_nodes(
                figure=figure,
                axes=axes,
                points=self._node_embedding.values,
            )

        return figure, axes

    def plot_edge_types(
        self,
        edge_type_predictions: List[int] = None,
        k: int = 9,
        figure: Figure = None,
        axes: Axes = None,
        scatter_kwargs: Dict = None,
        other_label: str = "Other",
        legend_title: str = "Edge types",
        train_indices: np.ndarray = None,
        test_indices: np.ndarray = None,
        train_marker: str = "o",
        test_marker: str = "X",
        show_title: bool = True,
        show_legend: bool = True,
        **kwargs: Dict
    ):
        """Plot common edge types of provided graph.

        Parameters
        ------------------------------
        edge_type_predictions: List[int] = None,
            Predictions of the edge types.
        k: int = 9,
            Number of edge types to visualize.
        figure: Figure = None,
            Figure to use to plot. If None, a new one is created using the
            provided kwargs.
        axes: Axes = None,
            Axes to use to plot. If None, a new one is created using the
            provided kwargs.
        scatter_kwargs: Dict = None,
            Kwargs to pass to the scatter plot call.
        other_label: str = "Other",
            Label to use for edges below the top k threshold.
        legend_title: str = "Edge types",
            Title for the legend.
        train_indices: np.ndarray = None,
            Indices to draw using the training marker.
            If None, all points are drawn using the training marker.
        test_indices: np.ndarray = None,
            Indices to draw using the test marker.
            If None, while providing the train indices, the remaining
            points are drawn using the test marker.
        train_marker: str = "o",
            The marker to use to draw the training points.
        test_marker: str = "X",
            The marker to use to draw the test points.
        show_title: bool = True,
            Whether to show the figure title.
        show_legend: bool = True,
            Whether to show the legend.
        **kwargs: Dict,
            Additional kwargs for the subplots.

        Raises
        ------------------------------
        ValueError,
            If the graph does not have edge types.
        ValueError,
            If edge fitting was not yet executed.
        ValueError,
            If given k is greater than maximum supported value (10).

        Returns
        ------------------------------
        Figure and Axis of the plot.
        """
        if not self._graph.has_edge_types():
            raise ValueError(
                "The graph does not have edge types!"
            )

        if self._edge_embedding is None:
            raise ValueError(
                "Edge fitting was not yet executed!"
            )

        edge_type_number = self._graph.get_edge_types_number()
        edge_types = np.array([
            edge_type_id
            if edge_type_id is not None
            else edge_type_number
            for edge_type_id in self._graph.get_edge_type_ids()
        ])

        if self._subsampled_edge_ids is not None:
            edge_types = edge_types[self._subsampled_edge_ids]

        edge_type_names = self._graph.get_unique_edge_type_names()

        if self._graph.has_unknown_edge_types():
            edge_type_names.append("Unknown")

        edge_type_names = np.array(edge_type_names)

        return self._plot_types(
            "Edge types",
            self._edge_embedding.values,
            types=edge_types,
            type_labels=edge_type_names,
            legend_title=legend_title,
            predictions=edge_type_predictions,
            k=k,
            figure=figure,
            axes=axes,
            scatter_kwargs=scatter_kwargs,
            other_label=other_label,
            train_indices=train_indices,
            test_indices=test_indices,
            train_marker=train_marker,
            test_marker=test_marker,
            show_title=show_title,
            show_legend=show_legend,
            **kwargs
        )

    def plot_edge_weights(
        self,
        figure: Figure = None,
        axes: Axes = None,
        scatter_kwargs: Dict = None,
        train_indices: np.ndarray = None,
        test_indices: np.ndarray = None,
        train_marker: str = "o",
        test_marker: str = "X",
        show_title: bool = True,
        show_legend: bool = True,
        **kwargs: Dict
    ):
        """Plot common edge types of provided graph.

        Parameters
        ------------------------------
        figure: Figure = None,
            Figure to use to plot. If None, a new one is created using the
            provided kwargs.
        axes: Axes = None,
            Axes to use to plot. If None, a new one is created using the
            provided kwargs.
        scatter_kwargs: Dict = None,
            Kwargs to pass to the scatter plot call.
        train_indices: np.ndarray = None,
            Indices to draw using the training marker.
            If None, all points are drawn using the training marker.
        test_indices: np.ndarray = None,
            Indices to draw using the test marker.
            If None, while providing the train indices, the remaining
            points are drawn using the test marker.
        train_marker: str = "o",
            The marker to use to draw the training points.
        test_marker: str = "X",
            The marker to use to draw the test points.
        show_title: bool = True,
            Whether to show the figure title.
        show_legend: bool = True,
            Whether to show the legend.
        **kwargs: Dict,
            Additional kwargs for the subplots.

        Raises
        ------------------------------
        ValueError,
            If the graph does not have edge weights.
        ValueError,
            If edge fitting was not yet executed.

        Returns
        ------------------------------
        Figure and Axis of the plot.
        """
        if not self._graph.has_edge_weights():
            raise ValueError(
                "The graph does not have edge weights!"
            )

        if self._edge_embedding is None:
            raise ValueError(
                "Edge fitting must be executed before plot."
            )

        weights = self._graph.get_edge_weights()
        if self._subsampled_edge_ids is not None:
            weights = weights[self._subsampled_edge_ids]

        figure, axes, scatter = self._plot_scatter(
            "Edge weights",
            self._edge_embedding.values,
            colors=weights,
            figure=figure,
            axes=axes,
            scatter_kwargs={
                **({} if scatter_kwargs is None else scatter_kwargs),
                "cmap": plt.cm.get_cmap('RdYlBu')
            },
            train_indices=train_indices,
            test_indices=test_indices,
            train_marker=train_marker,
            test_marker=test_marker,
            show_title=show_title,
            show_legend=show_legend,
            **kwargs
        )

        color_bar = figure.colorbar(scatter[0], ax=axes)
        color_bar.set_alpha(1)
        color_bar.draw_all()
        return figure, axes

    def plot_dot(self, engine: str = "circo"):
        """Return dot plot of the current graph.
        
        Parameters
        ------------------------------
        engine: str = "circo",
            The engine to use to visualize the graph.
        
        Raises
        ------------------------------
        ModuleNotFoundError,
            If graphviz is not installed.
        """
        try:
            import graphviz
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "In order to run the graph Dot visualization, "
                "the graphviz library must be installed. This "
                "library is not an explicit dependency of "
                "Embiggen because it may be hard to install "
                "on some systems and cause the Embiggen library "
                "to fail the installation.\n"
                "In order to install graphviz, try running "
                "`pip install graphviz`."
            )
        return graphviz.Source(
            self._graph.to_dot(),
            engine=engine
        )

Tools to visualize the graph embeddings.

#   GraphVisualization( graph: Graph, decomposition_method: str = 'TSNE', scaler_method: 'Scaler' = <class 'sklearn.preprocessing._data.RobustScaler'>, n_components: int = 2, node_embedding_method_name: str = None, edge_embedding_method: str = 'Hadamard', subsample_points: int = 20000, random_state: int = 42, decomposition_kwargs: Dict = None )
View Source
    def __init__(
        self,
        graph: Graph,
        decomposition_method: str = "TSNE",
        scaler_method: "Scaler" = RobustScaler,
        n_components: int = 2,
        node_embedding_method_name: str = None,
        edge_embedding_method: str = "Hadamard",
        subsample_points: int = 20_000,
        random_state: int = 42,
        decomposition_kwargs: Dict = None
    ):
        """Create new GraphVisualization object.

        Parameters
        --------------------------
        graph: Graph,
            The graph to visualize.
        decomposition_method: str = "TSNE",
            The decomposition method to use.
            The supported methods are TSNE and PCA.
        scaler_method: "Scaler" = RobustScaler,
            The scaler object to use to normalize the embedding.
            By default we use a Robust Scaler.
            Pass None to not use any scaler.
        n_components: int = 2,
            Number of components to reduce the embedding to.
            Currently, we support 2D and 3D decompositions; note that
            the CUDA TSNE backend only supports 2D decompositions.
        node_embedding_method_name: str = None,
            Name of the node embedding method used.
            If provided, it is added to the images titles.
        edge_embedding_method: str = "Hadamard",
            Edge embedding method.
            Can either be 'Hadamard', 'Sum', 'Average', 'L1', 'AbsoluteL1', 'L2' or 'Concatenate'.
        subsample_points: int = 20_000,
            Number of points to subsample.
            Some graphs have a number of nodes and edges in the millions.
            Using non-CUDA versions of TSNE, the dimensionality reduction
            procedure can take a considerable amount of time.
            For this purpose, we include the possibility to subsample the
            points to the given number.
            The subsampling is done in a way that takes into consideration
            the node types and/or edge types (the subsampling is applied
            separately to the two different sets) by using a Stratified Shuffle
            Split if there are node types or edge types.
            Otherwise, a normal train test split is used.
            If None is given, no subsampling is executed.
        random_state: int = 42,
            The random state to reproduce the visualizations.
        decomposition_kwargs: Dict = None,
            Kwargs to forward to the selected decomposition method.

        Raises
        ---------------------------
        ValueError,
            If the target decomposition size is not supported.
        ModuleNotFoundError,
            If TSNE decomposition has been required and no module supporting
            it is installed.
        """
        self._graph = graph
        self._graph_transformer = GraphTransformer(
            method=edge_embedding_method
        )
        self._node_transformer = NodeTransformer()
        self._node_embedding_method_name = node_embedding_method_name
        self._node_mapping = self._node_embedding = self._edge_embedding = None
        self._subsampled_node_ids = None
        self._subsampled_edge_ids = None
        self._subsample_points = subsample_points
        self._random_state = random_state

        if decomposition_kwargs is None:
            decomposition_kwargs = {}

        if n_components not in {2, 3}:
            raise ValueError(
                "We currently only support 2D and 3D decomposition visualization."
            )

        self._n_components = n_components
        self._scaler_method = None if scaler_method is None else scaler_method()

        if decomposition_method == "TSNE":
            try:
                # We try to use CUDA tsne if available, but this does not
                # currently support 3D decomposition. If the user has required a
                # 3D decomposition we need to switch to the MulticoreTSNE version.
                # Additionally, in the case that the desired decomposition
                # uses some not available parameters, such as a cosine distance
                # metric, we will capture that use case as a NotImplementedError.
                if n_components != 2:
                    raise NotImplementedError()
                from tsnecuda import TSNE as CUDATSNE  # pylint: disable=import-error,import-outside-toplevel
                self._decomposition_method = CUDATSNE(
                    n_components=2,
                    random_seed=random_state,
                    verbose=True,
                    **decomposition_kwargs
                )
            except (ModuleNotFoundError, NotImplementedError):
                try:
                    from MulticoreTSNE import \
                        MulticoreTSNE  # pylint: disable=import-outside-toplevel
                    self._decomposition_method = MulticoreTSNE(
                        n_components=n_components,
                        n_jobs=cpu_count(),
                        random_state=random_state,
                        verbose=True,
                        **decomposition_kwargs
                    )
                except ModuleNotFoundError:
                    try:
                        from sklearn.manifold import \
                            TSNE  # pylint: disable=import-outside-toplevel
                        self._decomposition_method = TSNE(
                            n_components=n_components,
                            n_jobs=cpu_count(),
                            random_state=random_state,
                            verbose=True,
                            **decomposition_kwargs
                        )
                    except:
                        raise ModuleNotFoundError(
                            "You do not have installed a supported TSNE "
                            "decomposition algorithm. Depending on your use case, "
                            "we suggest you install tsne-cuda if your graph is "
                            "very big (in the millions of nodes) if you have access "
                            "to a compatible GPU system.\n"
                            "Alternatively, we suggest (and support) MulticoreTSNE, "
                            "which tends to be easier to install, and is significantly "
                            "faster than the Sklearn implementation.\n"
                            "Alternatively, we suggest (and support) MulticoreTSNE, "
                            "which tends to be easier to install, and is significantly "
                            "faster than the Sklearn implementation.\n"
                            "If you intend to do 3D decompositions, "
                            "remember that tsne-cuda, at the time of writing, "
                            "does not support them."
                        )
        elif decomposition_method == "PCA":
            self._decomposition_method = PCA(
                n_components=n_components,
                random_state=random_state,
                **decomposition_kwargs
            )
        else:
            raise ValueError(
                "We currently only support PCA and TSNE decomposition methods."
            )

Create new GraphVisualization object.

Parameters
  • graph (Graph,): The graph to visualize.
  • decomposition_method (str = "TSNE",): The decomposition method to use. The supported methods are TSNE and PCA.
  • scaler_method ("Scaler" = RobustScaler,): The scaler object to use to normalize the embedding. By default we use a Robust Scaler. Pass None to not use any scaler.
  • n_components (int = 2,): Number of components to reduce the embedding to. Currently, we support 2D and 3D decompositions; note that the CUDA TSNE backend only supports 2D decompositions.
  • node_embedding_method_name (str = None,): Name of the node embedding method used. If provided, it is added to the images titles.
  • edge_embedding_method (str = "Hadamard",): Edge embedding method. Can either be 'Hadamard', 'Sum', 'Average', 'L1', 'AbsoluteL1', 'L2' or 'Concatenate'.
  • subsample_points (int = 20_000,): Number of points to subsample. Some graphs have a number of nodes and edges in the millions. Using non-CUDA versions of TSNE, the dimensionality reduction procedure can take a considerable amount of time. For this purpose, we include the possibility to subsample the points to the given number. The subsampling is done in a way that takes into consideration the node types and/or edge types (the subsampling is applied separately to the two different sets) by using a Stratified Shuffle Split if there are node types or edge types. Otherwise, a normal train test split is used. If None is given, no subsampling is executed.
  • random_state (int = 42,): The random state to reproduce the visualizations.
  • decomposition_kwargs (Dict = None,): Kwargs to forward to the selected decomposition method.
Raises
  • ValueError,: If the target decomposition size is not supported.
  • ModuleNotFoundError,: If TSNE decomposition has been required and no module supporting it is installed.
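A minimal, hedged usage sketch of the constructor follows; it assumes that `graph` is a grape/Ensmallen Graph instance loaded elsewhere and that the shown import path matches the installed package version.

    # Hypothetical usage sketch; `graph` is assumed to be a grape/Ensmallen
    # Graph instance loaded elsewhere.
    from sklearn.preprocessing import RobustScaler
    from grape.embiggen import GraphVisualization

    visualization = GraphVisualization(
        graph,
        decomposition_method="PCA",        # "TSNE" requires one of the TSNE backends
        scaler_method=RobustScaler,        # any sklearn-style scaler class, or None
        n_components=2,
        node_embedding_method_name="CBOW",
        edge_embedding_method="Hadamard",
        subsample_points=10_000,
        random_state=42,
    )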
#   DEFAULT_SCATTER_KWARGS = {'s': 5, 'alpha': 0.7}
#   DEFAULT_SUBPLOT_KWARGS = {'figsize': (7, 7), 'dpi': 200}
#   def decompose(self, X: numpy.ndarray) -> numpy.ndarray:
View Source
    def decompose(self, X: np.ndarray) -> np.ndarray:
        """Return requested decomposition of given array.

        Parameters
        -----------------------
        X: np.ndarray,
            The data to embed.

        Raises
        -----------------------
        ValueError,
            If the given vector has less components than the required
            decomposition target.

        Returns
        -----------------------
        The obtained decomposition.
        """
        if X.shape[1] == self._n_components:
            return X
        if X.shape[1] < self._n_components:
            raise ValueError(
                "The vector to decompose has less components than "
                "the decomposition target."
            )
        return self._decomposition_method.fit_transform(X)

Return requested decomposition of given array.

Parameters
  • X (np.ndarray,): The data to embed.
Raises
  • ValueError,: If the given vector has less components than the required decomposition target.
Returns
  • The obtained decomposition.
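A small sketch of the behaviour described above, reusing the `visualization` object from the constructor example (PCA backend assumed); the random data is purely illustrative.

    import numpy as np

    # A hypothetical 100-dimensional embedding for 1000 points.
    embedding = np.random.rand(1000, 100)
    points = visualization.decompose(embedding)
    assert points.shape == (1000, 2)  # reduced to n_components columns

    # Arrays that already have n_components columns are returned unchanged.
    already_2d = np.random.rand(1000, 2)
    assert visualization.decompose(already_2d) is already_2d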
#   def fit_transform_nodes(self, node_embedding: pandas.core.frame.DataFrame):
View Source
    def fit_transform_nodes(
        self,
        node_embedding: pd.DataFrame
    ):
        """Executes fitting for plotting node embeddings.

        Parameters
        -------------------------
        node_embedding: pd.DataFrame,
            Embedding of the graph nodes.
        """
        # Retrieve the nodes
        node_names = np.array(self._graph.get_node_names())
        # If necessary, we proceed with the subsampling
        if self._subsample_points is not None and self._graph.get_number_of_nodes() > self._subsample_points:
            # If there are node types, we use a stratified
            # node sampling so that all the nodes types may be displayed.
            if self._graph.has_node_types() and not self._graph.has_singleton_node_types():
                Splitter = StratifiedShuffleSplit
            else:
                # Otherwise there is no need to stratify.
                Splitter = ShuffleSplit
            # We compute the indices
            self._subsampled_node_ids, _ = next(Splitter(
                n_splits=1,
                train_size=self._subsample_points,
                random_state=self._random_state
            ).split(node_names, self._flatten_multi_label_and_unknown_node_types()))
            # And sample the nodes
            node_names = node_names[self._subsampled_node_ids]

        if self._scaler_method is not None:
            node_embedding = pd.DataFrame(
                self._scaler_method.fit_transform(node_embedding),
                columns=node_embedding.columns,
                index=node_embedding.index,
            )
        self._node_transformer.fit(node_embedding)
        self._node_embedding = pd.DataFrame(
            self.decompose(
                self._node_transformer.transform(node_names)
            ),
            index=node_names
        )

Executes fitting for plotting node embeddings.

Parameters
  • node_embedding (pd.DataFrame,): Embedding of the graph nodes.
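A hedged sketch of the call, assuming `node_embedding` is a pandas DataFrame with one row per node, indexed by node name (for example the output of a Word2Vec-style embedder); the random embedding below is purely illustrative.

    import numpy as np
    import pandas as pd

    # Hypothetical 50-dimensional embedding, one row per node of the graph.
    node_embedding = pd.DataFrame(
        np.random.rand(graph.get_number_of_nodes(), 50),
        index=graph.get_node_names(),
    )

    # Scales, (optionally) subsamples and decomposes the node embedding so
    # that the node scatter plots can be drawn afterwards.
    visualization.fit_transform_nodes(node_embedding)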
#   def fit_transform_edges( self, node_embedding: Union[pandas.core.frame.DataFrame, NoneType] = None, edge_embedding: Union[pandas.core.frame.DataFrame, NoneType] = None ):
View Source
    def fit_transform_edges(
        self,
        node_embedding: Optional[pd.DataFrame] = None,
        edge_embedding: Optional[pd.DataFrame] = None,
    ):
        """Executes fitting for plotting edge embeddings.

        Parameters
        -------------------------
        node_embedding: Optional[pd.DataFrame] = None,
            Node embedding obtained from SkipGram, CBOW, GloVe or other models.
        edge_embedding: Optional[pd.DataFrame] = None,
            Edge embedding.

        Raises
        -------------------------
        ValueError,
            If neither the node embedding nor the edge embedding have
            been provided. You need to provide exactly one of the two.
        ValueError,
            If the shape of the given node embedding does not match
            the number of nodes in the graph.
        ValueError,
            If the shape of the given edge embedding does not match
            the number of edges in the graph.
        """
        if node_embedding is None and edge_embedding is None:
            raise ValueError(
                "You need to provide either the node embedding or the "
                "edge embedding."
            )
        if node_embedding is not None and edge_embedding is not None:
            raise ValueError(
                "You need to provide either the node embedding or the "
                "edge embedding. You cannot provide both at once."
            )
        if node_embedding is not None and node_embedding.shape[0] != self._graph.get_number_of_nodes():
            raise ValueError(
                ("The number of rows provided with the given node embedding {} "
                 "does not match the number of nodes in the graph {}.").format(
                    node_embedding.shape[0],
                    self._graph.get_number_of_nodes()
                )
            )
        if edge_embedding is not None and edge_embedding.shape[0] != self._graph.get_directed_edges_number():
            raise ValueError(
                ("The number of rows provided with the given edge embedding {} "
                 "does not match the number of directed edges in the graph {}.").format(
                    edge_embedding.shape[0],
                    self._graph.get_directed_edges_number()
                )
            )

        # Retrieve the edges
        edge_names = np.array(self._graph.get_edge_node_names(directed=True))
        # If necessary, we proceed with the subsampling
        if self._subsample_points is not None and len(edge_names) > self._subsample_points:
            # If there are edge types, we use a stratified
            # edge sampling so that all the edges types may be displayed.
            if self._graph.has_edge_types() and not self._graph.has_singleton_edge_types():
                Splitter = StratifiedShuffleSplit
            else:
                # Otherwise there is no need to stratify.
                Splitter = ShuffleSplit
            # We compute the indices
            self._subsampled_edge_ids, _ = next(Splitter(
                n_splits=1,
                train_size=self._subsample_points,
                random_state=self._random_state
            ).split(edge_names, self._flatten_unknown_edge_types()))
            # And sample the edges
            edge_names = edge_names[self._subsampled_edge_ids]
            if edge_embedding is not None:
                edge_embedding = edge_embedding[self._subsampled_edge_ids]

        if node_embedding is not None:
            if self._scaler_method is not None:
                node_embedding = pd.DataFrame(
                    self._scaler_method.fit_transform(node_embedding),
                    columns=node_embedding.columns,
                    index=node_embedding.index,
                )
            self._graph_transformer.fit(node_embedding)
            edge_embedding = self._graph_transformer.transform(edge_names)
        self._edge_embedding = pd.DataFrame(
            self.decompose(edge_embedding),
            index=edge_names
        )

Executes fitting for plotting edge embeddings.

Parameters
  • node_embedding (Optional[pd.DataFrame] = None,): Node embedding obtained from SkipGram, CBOW, GloVe or other models.
  • edge_embedding (Optional[pd.DataFrame] = None,): Edge embedding.
Raises
  • ValueError,: If neither the node embedding nor the edge embedding have been provided. You need to provide exactly one of the two.
  • ValueError,: If the shape of the given node embedding does not match the number of nodes in the graph.
  • ValueError,: If the shape of the given edge embedding does not match the number of edges in the graph.
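Exactly one of the two embeddings must be provided; a sketch of both call styles, reusing the hypothetical `node_embedding` from the previous example.

    # Option 1: derive the edge embedding from the node embedding using the
    # edge embedding method selected in the constructor (e.g. Hadamard).
    visualization.fit_transform_edges(node_embedding=node_embedding)

    # Option 2: provide a precomputed edge embedding with one row per
    # directed edge of the graph (hypothetical DataFrame, not shown here).
    # visualization.fit_transform_edges(edge_embedding=edge_embedding)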
#   def plot_edge_segments( self, points: numpy.ndarray, figure: matplotlib.figure.Figure = None, axes: matplotlib.axes._axes.Axes = None, **kwargs: Dict ) -> Tuple[matplotlib.figure.Figure, matplotlib.axes._axes.Axes]:
View Source
    def plot_edge_segments(
        self,
        points: np.ndarray,
        figure: Figure = None,
        axes: Axes = None,
        **kwargs: Dict
    ) -> Tuple[Figure, Axes]:
        if figure is None or axes is None:
            figure, axes = self._get_figure_and_axes(**kwargs)

        if self._subsampled_node_ids is not None:
            edge_node_ids = self._graph.get_edge_ids_from_node_ids(
                node_ids=self._subsampled_node_ids,
                add_selfloops_where_missing=False,
                complete=False,
            )
        else:
            edge_node_ids = self._graph.get_edge_node_ids(
                directed=False
            )

        lines_collection = mc.LineCollection(
            points[edge_node_ids],
            linewidths=1,
            zorder=0
        )
        axes.add_collection(lines_collection)

        return figure, axes
#   def plot_nodes( self, figure: matplotlib.figure.Figure = None, axes: matplotlib.axes._axes.Axes = None, scatter_kwargs: Dict = None, train_indices: numpy.ndarray = None, test_indices: numpy.ndarray = None, train_marker: str = 'o', test_marker: str = 'X', show_title: bool = True, show_legend: bool = True, annotate_nodes: Union[str, bool] = 'auto', show_edges: bool = False, **kwargs: Dict ) -> Tuple[matplotlib.figure.Figure, matplotlib.axes._axes.Axes]:
View Source
    def plot_nodes(
        self,
        figure: Figure = None,
        axes: Axes = None,
        scatter_kwargs: Dict = None,
        train_indices: np.ndarray = None,
        test_indices: np.ndarray = None,
        train_marker: str = "o",
        test_marker: str = "X",
        show_title: bool = True,
        show_legend: bool = True,
        annotate_nodes: Union[str, bool] = "auto",
        show_edges: bool = False,
        **kwargs: Dict
    ) -> Tuple[Figure, Axes]:
        """Plot nodes of provided graph.

        Parameters
        ------------------------------
        figure: Figure = None,
            Figure to use to plot. If None, a new one is created using the
            provided kwargs.
        axes: Axes = None,
            Axes to use to plot. If None, a new one is created using the
            provided kwargs.
        scatter_kwargs: Dict = None,
            Kwargs to pass to the scatter plot call.
        train_indices: np.ndarray = None,
            Indices to draw using the training marker.
            If None, all points are drawn using the training marker.
        test_indices: np.ndarray = None,
            Indices to draw using the test marker.
            If None, while providing the train indices, the remaining
            points are drawn using the test marker.
        train_marker: str = "o",
            The marker to use to draw the training points.
        test_marker: str = "X",
            The marker to use to draw the test points.
        show_title: bool = True,
            Whether to show the figure title.
        show_legend: bool = True,
            Whether to show the legend.
        annotate_nodes: Union[str, bool] = "auto",
            Whether to show the node names when scattering the nodes.
            The default behaviour, "auto", enables this feature
            automatically when the graph has fewer than 100 nodes.
        **kwargs: Dict,
            Arguments to pass to the subplots.

        Raises
        ------------------------------
        ValueError,
            If node fitting was not yet executed.

        Returns
        ------------------------------
        Figure and Axis of the plot.
        """
        if self._node_embedding is None:
            raise ValueError(
                "Node fitting must be executed before plot."
            )

        if annotate_nodes == "auto":
            annotate_nodes = self._graph.get_number_of_nodes() < 100

        if show_edges:
            figure, axes = self.plot_edge_segments(
                self._node_embedding.values,
                figure,
                axes,
                **kwargs
            )

        figure, axes, _ = self._plot_scatter(
            "Nodes embedding",
            self._node_embedding.values,
            figure=figure,
            axes=axes,
            scatter_kwargs=scatter_kwargs,
            train_indices=train_indices,
            test_indices=test_indices,
            train_marker=train_marker,
            test_marker=test_marker,
            show_title=show_title,
            show_legend=show_legend,
            **kwargs
        )

        if annotate_nodes:
            figure, axes = self.annotate_nodes(
                figure=figure,
                axes=axes,
                points=self._node_embedding.values,
            )

        return figure, axes

Plot nodes of provided graph.

Parameters
  • figure (Figure = None,): Figure to use to plot. If None, a new one is created using the provided kwargs.
  • axes (Axes = None,): Axes to use to plot. If None, a new one is created using the provided kwargs.
  • scatter_kwargs (Dict = None,): Kwargs to pass to the scatter plot call.
  • train_indices (np.ndarray = None,): Indices to draw using the training marker. If None, all points are drawn using the training marker.
  • test_indices (np.ndarray = None,): Indices to draw using the test marker. If None, while providing the train indices, the remaining points are drawn using the test marker.
  • train_marker (str = "o",): The marker to use to draw the training points.
  • test_marker (str = "X",): The marker to use to draw the test points.
  • show_title (bool = True,): Whether to show the figure title.
  • show_legend (bool = True,): Whether to show the legend.
  • annotate_nodes (Union[str, bool] = "auto",): Whether to show the node names when scattering the nodes. The default behaviour, "auto", enables this feature automatically when the graph has fewer than 100 nodes.
  • **kwargs (Dict,): Arguments to pass to the subplots.
Raises
  • ValueError,: If node fitting was not yet executed.
Returns
  • Figure and Axis of the plot.
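A short sketch, assuming fit_transform_nodes has already been executed as shown earlier; the output file name is purely illustrative.

    figure, axes = visualization.plot_nodes(
        show_edges=True,         # also draw the edge segments behind the points
        annotate_nodes="auto",   # annotate node names only on small graphs
    )
    figure.savefig("nodes_embedding.png", dpi=200)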
#   def annotate_nodes( self, figure: matplotlib.figure.Figure, axes: matplotlib.axes._axes.Axes, points: numpy.ndarray ) -> Tuple[matplotlib.figure.Figure, matplotlib.axes._axes.Axes]:
View Source
    def annotate_nodes(
        self,
        figure: Figure,
        axes: Axes,
        points: np.ndarray
    ) -> Tuple[Figure, Axes]:
        if self._subsampled_node_ids is not None:
            node_names = [
                self._graph.get_node_name_from_node_id(node_id)
                for node_id in self._subsampled_node_ids
            ]
        else:
            node_names = self._graph.get_node_names()
        for i, txt in enumerate(node_names):
            axes.annotate(txt, points[i])
        return (figure, axes)
#   def plot_edges( self, figure: matplotlib.figure.Figure = None, axes: matplotlib.axes._axes.Axes = None, scatter_kwargs: Dict = None, train_indices: numpy.ndarray = None, test_indices: numpy.ndarray = None, train_marker: str = 'o', test_marker: str = 'X', show_title: bool = True, show_legend: bool = True, **kwargs: Dict ) -> Tuple[matplotlib.figure.Figure, matplotlib.axes._axes.Axes]:
View Source
    def plot_edges(
        self,
        figure: Figure = None,
        axes: Axes = None,
        scatter_kwargs: Dict = None,
        train_indices: np.ndarray = None,
        test_indices: np.ndarray = None,
        train_marker: str = "o",
        test_marker: str = "X",
        show_title: bool = True,
        show_legend: bool = True,
        **kwargs: Dict
    ) -> Tuple[Figure, Axes]:
        """Plot edge embedding of provided graph.

        Parameters
        ------------------------------
        figure: Figure = None,
            Figure to use to plot. If None, a new one is created using the
            provided kwargs.
        axes: Axes = None,
            Axes to use to plot. If None, a new one is created using the
            provided kwargs.
        scatter_kwargs: Dict = None,
            Kwargs to pass to the scatter plot call.
        train_indices: np.ndarray = None,
            Indices to draw using the training marker.
            If None, all points are drawn using the training marker.
        test_indices: np.ndarray = None,
            Indices to draw using the test marker.
            If None, while providing the train indices, the remaining
            points are drawn using the test marker.
        train_marker: str = "o",
            The marker to use to draw the training points.
        test_marker: str = "X",
            The marker to use to draw the test points.
        show_title: bool = True,
            Whether to show the figure title.
        show_legend: bool = True,
            Whether to show the legend.
        **kwargs: Dict,
            Arguments to pass to the subplots.

        Raises
        ------------------------------
        ValueError,
            If edge fitting was not yet executed.

        Returns
        ------------------------------
        Figure and Axis of the plot.
        """
        if self._edge_embedding is None:
            raise ValueError(
                "Edge fitting must be executed before plot."
            )

        figure, axes, _ = self._plot_scatter(
            "Edges embedding",
            self._edge_embedding,
            figure=figure,
            axes=axes,
            scatter_kwargs=scatter_kwargs,
            train_indices=train_indices,
            test_indices=test_indices,
            train_marker=train_marker,
            test_marker=test_marker,
            show_title=show_title,
            show_legend=show_legend,
            **kwargs
        )

        return figure, axes

Plot edge embedding of provided graph.

Parameters
  • figure (Figure = None,): Figure to use to plot. If None, a new one is created using the provided kwargs.
  • axes (Axes = None,): Axes to use to plot. If None, a new one is created using the provided kwargs.
  • scatter_kwargs (Dict = None,): Kwargs to pass to the scatter plot call.
  • train_indices (np.ndarray = None,): Indices to draw using the training marker. If None, all points are drawn using the training marker.
  • test_indices (np.ndarray = None,): Indices to draw using the test marker. If None, while providing the train indices, the remaining points are drawn using the test marker.
  • train_marker (str = "o",): The marker to use to draw the training points.
  • test_marker (str = "X",): The marker to use to draw the test points.
  • show_title (bool = True,): Whether to show the figure title.
  • show_legend (bool = True,): Whether to show the legend.
  • **kwargs (Dict,): Arguments to pass to the subplots.
Raises
  • ValueError,: If edge fitting was not yet executed.
Returns
  • Figure and Axis of the plot.
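A sketch of the train/test marker distinction, assuming fit_transform_edges has been executed; the index split below is hypothetical and refers to the plotted edge points (after any subsampling), not to the full edge list of the graph.

    import numpy as np

    # Hypothetical split of the plotted edge points into train and test.
    train_indices = np.arange(0, 8_000)
    test_indices = np.arange(8_000, 10_000)

    figure, axes = visualization.plot_edges(
        train_indices=train_indices,
        test_indices=test_indices,
        train_marker="o",
        test_marker="X",
    )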
#   def plot_node_types( self, node_type_predictions: List[int] = None, k: int = 9, figure: matplotlib.figure.Figure = None, axes: matplotlib.axes._axes.Axes = None, scatter_kwargs: Dict = None, legend_title: str = 'Node types', other_label: str = 'Other', train_indices: numpy.ndarray = None, test_indices: numpy.ndarray = None, train_marker: str = 'o', test_marker: str = 'X', show_title: bool = True, show_legend: bool = True, show_edges: bool = False, annotate_nodes: Union[str, bool] = 'auto', **kwargs ) -> Tuple[matplotlib.figure.Figure, matplotlib.axes._axes.Axes]:
View Source
    def plot_node_types(
        self,
        node_type_predictions: List[int] = None,
        k: int = 9,
        figure: Figure = None,
        axes: Axes = None,
        scatter_kwargs: Dict = None,
        legend_title: str = "Node types",
        other_label: str = "Other",
        train_indices: np.ndarray = None,
        test_indices: np.ndarray = None,
        train_marker: str = "o",
        test_marker: str = "X",
        show_title: bool = True,
        show_legend: bool = True,
        show_edges: bool = False,
        annotate_nodes: Union[str, bool] = "auto",
        **kwargs
    ) -> Tuple[Figure, Axes]:
        """Plot common node types of provided graph.

        Parameters
        ------------------------------
        node_type_predictions: List[int] = None,
            Predictions of the node types.
        k: int = 9,
            Number of node types to visualize.
        figure: Figure = None,
            Figure to use to plot. If None, a new one is created using the
            provided kwargs.
        axes: Axes = None,
            Axes to use to plot. If None, a new one is created using the
            provided kwargs.
        scatter_kwargs: Dict = None,
            Kwargs to pass to the scatter plot call.
        other_label: str = "Other",
            Label to use for edges below the top k threshold.
        train_indices: np.ndarray = None,
            Indices to draw using the training marker.
            If None, all points are drawn using the training marker.
        test_indices: np.ndarray = None,
            Indices to draw using the test marker.
            If None, while providing the train indices, the remaining
            points are drawn using the test marker.
        train_marker: str = "o",
            The marker to use to draw the training points.
        test_marker: str = "X",
            The marker to use to draw the test points.
        show_title: bool = True,
            Whether to show the figure title.
        show_legend: bool = True,
            Whether to show the legend.
        **kwargs: Dict,
            Arguments to pass to the subplots.

        Raises
        ------------------------------
        ValueError,
            If the graph does not have node types.
        ValueError,
            If node fitting was not yet executed.
        ValueError,
            If given k is greater than maximum supported value (10).

        Returns
        ------------------------------
        Figure and Axis of the plot.
        """
        if not self._graph.has_node_types():
            raise ValueError(
                "The graph does not have node types!"
            )

        if self._node_embedding is None:
            raise ValueError(
                "Node fitting must be executed before plot."
            )

        if show_edges:
            figure, axes = self.plot_edge_segments(
                self._node_embedding.values,
                figure,
                axes,
                **kwargs
            )

        if annotate_nodes == "auto":
            annotate_nodes = self._graph.get_number_of_nodes() < 100

        node_types = self._flatten_multi_label_and_unknown_node_types()
        if self._subsampled_node_ids is not None:
            node_types = node_types[self._subsampled_node_ids]

        node_type_names = self._graph.get_unique_node_type_names()

        if self._graph.has_unknown_node_types():
            node_type_names.append("Unknown")

        node_type_names = np.array(node_type_names)

        figure, axes = self._plot_types(
            "Node types",
            self._node_embedding.values,
            types=node_types,
            type_labels=node_type_names,
            legend_title=legend_title,
            predictions=node_type_predictions,
            k=k,
            figure=figure,
            axes=axes,
            scatter_kwargs=scatter_kwargs,
            other_label=other_label,
            train_indices=train_indices,
            test_indices=test_indices,
            train_marker=train_marker,
            test_marker=test_marker,
            show_title=show_title,
            show_legend=show_legend,
            **kwargs
        )

        if annotate_nodes:
            figure, axes = self.annotate_nodes(
                figure=figure,
                axes=axes,
                points=self._node_embedding.values,
            )

        return figure, axes

Plot common node types of provided graph.

Parameters
  • node_type_predictions (List[int] = None,): Predictions of the node types.
  • k (int = 9,): Number of node types to visualize.
  • figure (Figure = None,): Figure to use to plot. If None, a new one is created using the provided kwargs.
  • axes (Axes = None,): Axes to use to plot. If None, a new one is created using the provided kwargs.
  • scatter_kwargs (Dict = None,): Kwargs to pass to the scatter plot call.
  • other_label (str = "Other",): Label to use for edges below the top k threshold.
  • train_indices (np.ndarray = None,): Indices to draw using the training marker. If None, all points are drawn using the training marker.
  • test_indices (np.ndarray = None,): Indices to draw using the test marker. If None, while providing the train indices, the remaining points are drawn using the test marker.
  • train_marker (str = "o",): The marker to use to draw the training points.
  • test_marker (str = "X",): The marker to use to draw the test points.
  • show_title (bool = True,): Whether to show the figure title.
  • show_legend (bool = True,): Whether to show the legend.
  • **kwargs (Dict,): Arguments to pass to the subplots.
Raises
  • ValueError,: If the graph does not have node types.
  • ValueError,: If node fitting was not yet executed.
  • ValueError,: If given k is greater than maximum supported value (10).
Returns
  • Figure and Axis of the plot.
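A sketch, assuming node fitting has been executed and the graph has node types; the commented-out overlay assumes a hypothetical classifier output.

    # Colour the node scatter plot by the 9 most common node types, grouping
    # the remaining ones under the "Other" label.
    figure, axes = visualization.plot_node_types(
        k=9,
        legend_title="Node types",
    )

    # Optionally overlay predicted node types, e.g. one integer type id per
    # plotted node produced by a hypothetical node-label classifier.
    # figure, axes = visualization.plot_node_types(node_type_predictions=predictions)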
#   def plot_connected_components( self, k: int = 9, figure: matplotlib.figure.Figure = None, axes: matplotlib.axes._axes.Axes = None, scatter_kwargs: Dict = None, other_label: str = 'Other', legend_title: str = 'Component sizes', train_indices: numpy.ndarray = None, test_indices: numpy.ndarray = None, train_marker: str = 'o', test_marker: str = 'X', show_title: bool = True, show_legend: bool = True, show_edges: bool = False, annotate_nodes: Union[str, bool] = 'auto', **kwargs ) -> Tuple[matplotlib.figure.Figure, matplotlib.axes._axes.Axes]:
View Source
    def plot_connected_components(
        self,
        k: int = 9,
        figure: Figure = None,
        axes: Axes = None,
        scatter_kwargs: Dict = None,
        other_label: str = "Other",
        legend_title: str = "Component sizes",
        train_indices: np.ndarray = None,
        test_indices: np.ndarray = None,
        train_marker: str = "o",
        test_marker: str = "X",
        show_title: bool = True,
        show_legend: bool = True,
        show_edges: bool = False,
        annotate_nodes: Union[str, bool] = "auto",
        **kwargs
    ) -> Tuple[Figure, Axes]:
        """Plot common node types of provided graph.

        Parameters
        ------------------------------
        k: int = 9,
            Number of components to visualize.
        figure: Figure = None,
            Figure to use to plot. If None, a new one is created using the
            provided kwargs.
        axes: Axes = None,
            Axes to use to plot. If None, a new one is created using the
            provided kwargs.
        scatter_kwargs: Dict = None,
            Kwargs to pass to the scatter plot call.
        other_label: str = "Other",
            Label to use for edges below the top k threshold.
        legend_title: str = "Component sizes",
            Title for the legend.
        train_indices: np.ndarray = None,
            Indices to draw using the training marker.
            If None, all points are drawn using the training marker.
        test_indices: np.ndarray = None,
            Indices to draw using the test marker.
            If None, while providing the train indices, the remaining
            points are drawn using the test marker.
        train_marker: str = "o",
            The marker to use to draw the training points.
        test_marker: str = "X",
            The marker to use to draw the test points.
        show_title: bool = True,
            Whether to show the figure title.
        show_legend: bool = True,
            Whether to show the legend.
        **kwargs: Dict,
            Arguments to pass to the subplots.

        Raises
        ------------------------------
        ValueError,
            If node fitting was not yet executed.
        ValueError,
            If given k is greater than maximum supported value (10).

        Returns
        ------------------------------
        Figure and Axis of the plot.
        """
        if self._node_embedding is None:
            raise ValueError(
                "Node fitting must be executed before plot."
            )

        if show_edges:
            figure, axes = self.plot_edge_segments(
                self._node_embedding.values,
                figure,
                axes,
                **kwargs
            )

        if annotate_nodes == "auto":
            annotate_nodes = self._graph.get_number_of_nodes() < 100

        components, components_number, _, _ = self._graph.connected_components()
        sizes = np.bincount(components, minlength=components_number)

        if self._subsampled_node_ids is not None:
            components = components[self._subsampled_node_ids]

        figure, axes = self._plot_types(
            "Components",
            self._node_embedding.values,
            types=components,
            type_labels=np.array([
                "Size {}".format(size)
                for size in sizes
            ]),
            legend_title=legend_title,
            show_title=show_title,
            show_legend=show_legend,
            k=k,
            figure=figure,
            axes=axes,
            scatter_kwargs=scatter_kwargs,
            other_label=other_label,
            train_indices=train_indices,
            test_indices=test_indices,
            train_marker=train_marker,
            test_marker=test_marker,
            **kwargs
        )

        if annotate_nodes:
            figure, axes = self.annotate_nodes(
                figure=figure,
                axes=axes,
                points=self._node_embedding.values,
            )

        return figure, axes

Plot the connected components of the provided graph.

Parameters
  • k (int = 9,): Number of components to visualize.
  • figure (Figure = None,): Figure to use to plot. If None, a new one is created using the provided kwargs.
  • axes (Axes = None,): Axes to use to plot. If None, a new one is created using the provided kwargs.
  • scatter_kwargs (Dict = None,): Kwargs to pass to the scatter plot call.
  • other_label (str = "Other",): Label to use for edges below the top k threshold.
  • legend_title (str = "Component sizes",): Title for the legend.
  • train_indices (np.ndarray = None,): Indices to draw using the training marker. If None, all points are drawn using the training marker.
  • test_indices (np.ndarray = None,): Indices to draw using the test marker. If None, while providing the train indices, the remaining points are drawn using the test marker.
  • train_marker (str = "o",): The marker to use to draw the training points.
  • test_marker (str = "X",): The marker to use to draw the test points.
  • show_title (bool = True,): Whether to show the figure title.
  • show_legend (bool = True,): Whether to show the legend.
  • **kwargs (Dict,): Arguments to pass to the subplots.
Raises
  • ValueError,: If node fitting was not yet executed.
  • ValueError,: If given k is greater than maximum supported value (10).
Returns
  • Figure and Axis of the plot.
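A sketch, assuming node fitting has been executed.

    # Colour the node scatter plot by connected component, keeping k components
    # and collapsing the remaining ones under the "Other" label.
    figure, axes = visualization.plot_connected_components(
        k=9,
        legend_title="Component sizes",
    )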
#   def plot_node_degrees( self, figure: matplotlib.figure.Figure = None, axes: matplotlib.axes._axes.Axes = None, scatter_kwargs: Dict = None, train_indices: numpy.ndarray = None, test_indices: numpy.ndarray = None, train_marker: str = 'o', test_marker: str = 'X', use_log_scale: bool = True, show_title: bool = True, show_legend: bool = True, show_edges: bool = False, annotate_nodes: Union[str, bool] = 'auto', **kwargs: Dict ):
View Source
    def plot_node_degrees(
        self,
        figure: Figure = None,
        axes: Axes = None,
        scatter_kwargs: Dict = None,
        train_indices: np.ndarray = None,
        test_indices: np.ndarray = None,
        train_marker: str = "o",
        test_marker: str = "X",
        use_log_scale: bool = True,
        show_title: bool = True,
        show_legend: bool = True,
        show_edges: bool = False,
        annotate_nodes: Union[str, bool] = "auto",
        **kwargs: Dict
    ):
        """Plot node degrees heatmap.

        Parameters
        ------------------------------
        figure: Figure = None,
            Figure to use to plot. If None, a new one is created using the
            provided kwargs.
        axes: Axes = None,
            Axes to use to plot. If None, a new one is created using the
            provided kwargs.
        scatter_kwargs: Dict = None,
            Kwargs to pass to the scatter plot call.
        train_indices: np.ndarray = None,
            Indices to draw using the training marker.
            If None, all points are drawn using the training marker.
        test_indices: np.ndarray = None,
            Indices to draw using the test marker.
            If None, while providing the train indices, the remaining
            points are drawn using the test marker.
        train_marker: str = "o",
            The marker to use to draw the training points.
        test_marker: str = "X",
            The marker to use to draw the test points.
        use_log_scale: bool = True,
            Whether to use log scale.
        show_title: bool = True,
            Whether to show the figure title.
        show_legend: bool = True,
            Whether to show the legend.
        **kwargs: Dict,
            Additional kwargs for the subplots.

        Raises
        ------------------------------
        ValueError,
            If node fitting was not yet executed.

        Returns
        ------------------------------
        Figure and Axis of the plot.
        """
        if self._node_embedding is None:
            raise ValueError(
                "Node fitting must be executed before plot."
            )

        degrees = self._graph.get_node_degrees()
        if self._subsampled_node_ids is not None:
            degrees = degrees[self._subsampled_node_ids]

        if annotate_nodes == "auto":
            annotate_nodes = self._graph.get_number_of_nodes() < 100

        if show_edges:
            figure, axes = self.plot_edge_segments(
                self._node_embedding.values,
                figure,
                axes,
                **kwargs
            )

        figure, axes, scatter = self._plot_scatter(
            "Node degrees",
            self._node_embedding.values,
            colors=degrees,
            figure=figure,
            axes=axes,
            scatter_kwargs={
                **({} if scatter_kwargs is None else scatter_kwargs),
                "cmap": plt.cm.get_cmap('RdYlBu'),
                **({"norm": LogNorm()} if use_log_scale else {})
            },
            train_indices=train_indices,
            test_indices=test_indices,
            train_marker=train_marker,
            test_marker=test_marker,
            show_title=show_title,
            show_legend=show_legend,
            **kwargs
        )

        color_bar = figure.colorbar(scatter[0], ax=axes)
        color_bar.set_alpha(1)
        color_bar.draw_all()

        if annotate_nodes:
            figure, axes = self.annotate_nodes(
                figure=figure,
                axes=axes,
                points=self._node_embedding.values,
            )

        return figure, axes

Plot node degrees heatmap.

Parameters
  • figure (Figure = None,): Figure to use to plot. If None, a new one is created using the provided kwargs.
  • axes (Axes = None,): Axes to use to plot. If None, a new one is created using the provided kwargs.
  • scatter_kwargs (Dict = None,): Kwargs to pass to the scatter plot call.
  • train_indices (np.ndarray = None,): Indices to draw using the training marker. If None, all points are drawn using the training marker.
  • test_indices (np.ndarray = None,): Indices to draw using the test marker. If None, while providing the train indices, the remaining points are drawn using the test marker.
  • train_marker (str = "o",): The marker to use to draw the training points.
  • test_marker (str = "X",): The marker to use to draw the test points.
  • use_log_scale (bool = True,): Whether to use log scale.
  • show_title (bool = True,): Whether to show the figure title.
  • show_legend (bool = True,): Whether to show the legend.
  • **kwargs (Dict,): Additional kwargs for the subplots.
Raises
  • ValueError,: If node fitting was not yet executed.
Returns
  • Figure and Axis of the plot.
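A sketch, assuming node fitting has been executed; the logarithmic colour scale is usually appropriate for the skewed degree distributions of real-world graphs. The output file name is purely illustrative.

    figure, axes = visualization.plot_node_degrees(
        use_log_scale=True,
    )
    figure.savefig("node_degrees.png", dpi=200)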
#   def plot_edge_types( self, edge_type_predictions: List[int] = None, k: int = 9, figure: matplotlib.figure.Figure = None, axes: matplotlib.axes._axes.Axes = None, scatter_kwargs: Dict = None, other_label: str = 'Other', legend_title: str = 'Edge types', train_indices: numpy.ndarray = None, test_indices: numpy.ndarray = None, train_marker: str = 'o', test_marker: str = 'X', show_title: bool = True, show_legend: bool = True, **kwargs: Dict ):
View Source
    def plot_edge_types(
        self,
        edge_type_predictions: List[int] = None,
        k: int = 9,
        figure: Figure = None,
        axes: Axes = None,
        scatter_kwargs: Dict = None,
        other_label: str = "Other",
        legend_title: str = "Edge types",
        train_indices: np.ndarray = None,
        test_indices: np.ndarray = None,
        train_marker: str = "o",
        test_marker: str = "X",
        show_title: bool = True,
        show_legend: bool = True,
        **kwargs: Dict
    ):
        """Plot common edge types of provided graph.

        Parameters
        ------------------------------
        edge_type_predictions: List[int] = None,
            Predictions of the edge types.
        k: int = 9,
            Number of edge types to visualize.
        figure: Figure = None,
            Figure to use to plot. If None, a new one is created using the
            provided kwargs.
        axes: Axes = None,
            Axes to use to plot. If None, a new one is created using the
            provided kwargs.
        scatter_kwargs: Dict = None,
            Kwargs to pass to the scatter plot call.
        other_label: str = "Other",
            Label to use for edges below the top k threshold.
        legend_title: str = "Edge types",
            Title for the legend.
        train_indices: np.ndarray = None,
            Indices to draw using the training marker.
            If None, all points are drawn using the training marker.
        test_indices: np.ndarray = None,
            Indices to draw using the test marker.
            If None, while providing the train indices, the remaining
            points are drawn using the test marker.
        train_marker: str = "o",
            The marker to use to draw the training points.
        test_marker: str = "X",
            The marker to use to draw the test points.
        show_title: bool = True,
            Whether to show the figure title.
        show_legend: bool = True,
            Whether to show the legend.
        **kwargs: Dict,
            Additional kwargs for the subplots.

        Raises
        ------------------------------
        ValueError,
            If the graph does not have edge types.
        ValueError,
            If edge fitting was not yet executed.
        ValueError,
            If given k is greater than maximum supported value (10).

        Returns
        ------------------------------
        Figure and Axis of the plot.
        """
        if not self._graph.has_edge_types():
            raise ValueError(
                "The graph does not have edge types!"
            )

        if self._edge_embedding is None:
            raise ValueError(
                "Edge fitting was not yet executed!"
            )

        edge_type_number = self._graph.get_edge_types_number()
        edge_types = np.array([
            edge_type_id
            if edge_type_id is not None
            else edge_type_number
            for edge_type_id in self._graph.get_edge_type_ids()
        ])

        if self._subsampled_edge_ids is not None:
            edge_types = edge_types[self._subsampled_edge_ids]

        edge_type_names = self._graph.get_unique_edge_type_names()

        if self._graph.has_unknown_edge_types():
            edge_type_names.append("Unknown")

        edge_type_names = np.array(edge_type_names)

        return self._plot_types(
            "Edge types",
            self._edge_embedding.values,
            types=edge_types,
            type_labels=edge_type_names,
            legend_title=legend_title,
            predictions=edge_type_predictions,
            k=k,
            figure=figure,
            axes=axes,
            scatter_kwargs=scatter_kwargs,
            other_label=other_label,
            train_indices=train_indices,
            test_indices=test_indices,
            train_marker=train_marker,
            test_marker=test_marker,
            show_title=show_title,
            show_legend=show_legend,
            **kwargs
        )

Plot common edge types of provided graph.

Parameters
  • edge_type_predictions (List[int] = None,): Predictions of the edge types.
  • k (int = 9,): Number of edge types to visualize.
  • figure (Figure = None,): Figure to use to plot. If None, a new one is created using the provided kwargs.
  • axes (Axes = None,): Axes to use to plot. If None, a new one is created using the provided kwargs.
  • scatter_kwargs (Dict = None,): Kwargs to pass to the scatter plot call.
  • other_label (str = "Other",): Label to use for edges below the top k threshold.
  • legend_title (str = "Edge types",): Title for the legend.
  • train_indices (np.ndarray = None,): Indices to draw using the training marker. If None, all points are drawn using the training marker.
  • test_indices (np.ndarray = None,): Indices to draw using the test marker. If None while the train indices are provided, all remaining points are drawn using the test marker.
  • train_marker (str = "o",): The marker to use to draw the training points.
  • test_marker (str = "X",): The marker to use to draw the test points.
  • show_title (bool = True,): Whether to show the figure title.
  • show_legend (bool = True,): Whether to show the legend.
  • **kwargs (Dict,): Additional kwargs for the subplots.
Raises
  • ValueError,: If the graph does not have edge types.
  • ValueError,: If edge fitting was not yet executed.
  • ValueError,: If given k is greater than maximum supported value (10).
Returns
  • Figure and Axis of the plot.
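A brief usage sketch may help tie the parameters above together. The setup is an assumption: `visualization` stands for a GraphVisualization instance built on a graph that has edge types and whose edge embedding has already been fitted (otherwise the ValueError documented above is raised); only the plot_edge_types call itself mirrors this documentation.

import matplotlib.pyplot as plt

# Assumption: `visualization` is a GraphVisualization wrapping a graph with
# edge types, and its edge embedding has already been fitted.
figure, axes = visualization.plot_edge_types(
    k=5,                      # show the 5 most frequent edge types (max supported is 10)
    other_label="Other",      # pool the remaining edge types under one label
    legend_title="Edge types",
    show_title=True,
    show_legend=True,
)
figure.savefig("edge_types.png", dpi=200)
plt.close(figure)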
#   def plot_edge_weights( self, figure: matplotlib.figure.Figure = None, axes: matplotlib.axes._axes.Axes = None, scatter_kwargs: Dict = None, train_indices: numpy.ndarray = None, test_indices: numpy.ndarray = None, train_marker: str = 'o', test_marker: str = 'X', show_title: bool = True, show_legend: bool = True, **kwargs: Dict ):
View Source
    def plot_edge_weights(
        self,
        figure: Figure = None,
        axes: Axes = None,
        scatter_kwargs: Dict = None,
        train_indices: np.ndarray = None,
        test_indices: np.ndarray = None,
        train_marker: str = "o",
        test_marker: str = "X",
        show_title: bool = True,
        show_legend: bool = True,
        **kwargs: Dict
    ):
        """Plot common edge types of provided graph.

        Parameters
        ------------------------------
        figure: Figure = None,
            Figure to use to plot. If None, a new one is created using the
            provided kwargs.
        axes: Axes = None,
            Axes to use to plot. If None, a new one is created using the
            provided kwargs.
        scatter_kwargs: Dict = None,
            Kwargs to pass to the scatter plot call.
        train_indices: np.ndarray = None,
            Indices to draw using the training marker.
            If None, all points are drawn using the training marker.
        test_indices: np.ndarray = None,
            Indices to draw using the test marker.
            If None while the train indices are provided, all
            remaining points are drawn using the test marker.
        train_marker: str = "o",
            The marker to use to draw the training points.
        test_marker: str = "X",
            The marker to use to draw the test points.
        show_title: bool = True,
            Whether to show the figure title.
        show_legend: bool = True,
            Whether to show the legend.
        **kwargs: Dict,
            Additional kwargs for the subplots.

        Raises
        ------------------------------
        ValueError,
            If the graph does not have edge weights.
        ValueError,
            If edge fitting was not yet executed.

        Returns
        ------------------------------
        Figure and Axis of the plot.
        """
        if not self._graph.has_edge_weights():
            raise ValueError(
                "The graph does not have edge weights!"
            )

        if self._edge_embedding is None:
            raise ValueError(
                "Edge fitting must be executed before plot."
            )

        weights = self._graph.get_edge_weights()
        if self._subsampled_edge_ids is not None:
            weights = weights[self._subsampled_edge_ids]

        figure, axes, scatter = self._plot_scatter(
            "Edge weights",
            self._edge_embedding.values,
            colors=weights,
            figure=figure,
            axes=axes,
            scatter_kwargs={
                **({} if scatter_kwargs is None else scatter_kwargs),
                "cmap": plt.cm.get_cmap('RdYlBu')
            },
            train_indices=train_indices,
            test_indices=test_indices,
            train_marker=train_marker,
            test_marker=test_marker,
            show_title=show_title,
            show_legend=show_legend,
            **kwargs
        )

        color_bar = figure.colorbar(scatter[0], ax=axes)
        color_bar.set_alpha(1)
        color_bar.draw_all()
        return figure, axes

Plot the edge weights of the provided graph.

Parameters
  • figure (Figure = None,): Figure to use to plot. If None, a new one is created using the provided kwargs.
  • axes (Axes = None,): Axes to use to plot. If None, a new one is created using the provided kwargs.
  • scatter_kwargs (Dict = None,): Kwargs to pass to the scatter plot call.
  • train_indices (np.ndarray = None,): Indices to draw using the training marker. If None, all points are drawn using the training marker.
  • test_indices (np.ndarray = None,): Indices to draw using the test marker. If None while the train indices are provided, all remaining points are drawn using the test marker.
  • train_marker (str = "o",): The marker to use to draw the training points.
  • test_marker (str = "X",): The marker to use to draw the test points.
  • show_title (bool = True,): Whether to show the figure title.
  • show_legend (bool = True,): Whether to show the legend.
  • **kwargs (Dict,): Additional kwargs for the subplots.
Raises
  • ValueError,: If the graph does not have edge weights.
  • ValueError,: If edge fitting was not yet executed.
Returns
  • Figure and Axis of the plot.
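Analogously, a hedged sketch for plot_edge_weights; again `visualization` is assumed to wrap a graph with edge weights and a fitted edge embedding, since both conditions are checked in the source above.

# Assumption: `visualization` wraps a weighted graph with a fitted edge
# embedding; the call also attaches the RdYlBu color bar shown in the source.
figure, axes = visualization.plot_edge_weights(show_title=True)
figure.savefig("edge_weights.png", dpi=200)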
#   def plot_dot(self, engine: str = 'circo'):
View Source
    def plot_dot(self, engine: str = "circo"):
        """Return dot plot of the current graph.
        
        Parameters
        ------------------------------
        engine: str = "circo",
            The engine to use to visualize the graph.
        
        Raises
        ------------------------------
        ModuleNotFoundError,
            If graphviz is not installed.
        """
        try:
            import graphviz
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "In order to run the graph Dot visualization, "
                "the graphviz library must be installed. This "
                "library is not an explicit dependency of "
                "Embiggen because it may be hard to install "
                "on some systems and cause the Embiggen library "
                "to fail the installation.\n"
                "In order to install graphviz, try running "
                "`pip install graphviz`."
            )
        return graphviz.Source(
            self._graph.to_dot(),
            engine=engine
        )

Return dot plot of the current graph.

Parameters
  • engine (str = "circo",): The engine to use to visualize the graph.
Raises
  • ModuleNotFoundError,: If graphviz is not installed.
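Because graphviz is an optional dependency, here is a minimal rendering sketch; installing the graphviz Python package and the system Graphviz binaries is assumed, and `visualization` again stands for any GraphVisualization instance.

# Assumption: `visualization` is a GraphVisualization instance; plot_dot only
# needs the wrapped graph, not a fitted embedding.
dot = visualization.plot_dot(engine="circo")
# graphviz.Source.render writes the layout to disk (here as an SVG file).
dot.render("graph_layout", format="svg", cleanup=True)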
#   class TransE(grape.embiggen.Siamese):
View Source
class TransE(Siamese):
    """Siamese network for node-embedding including optionally node types and edge types."""

    def __init__(
        self,
        graph: Graph,
        embedding_size: int = 100,
        distance_metric: str = "COSINE",
        use_node_types: Union[bool, str] = "auto",
        node_types_combination: str = "Add",
        embedding: Union[np.ndarray, pd.DataFrame] = None,
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
        model_name: str = "TransE",
        optimizer: Union[str, Optimizer] = None,
        support_mirrored_strategy: bool = False,
        use_gradient_centralization: str = "auto"
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        vocabulary_size: int = None,
            Number of terms to embed.
            In a graph this is the number of nodes, while in a text it is the
            number of unique words.
            If None, the seed embedding must be provided.
            It is not possible to provide both at once.
        embedding_size: int = 100,
            Dimension of the embedding.
            If None, the seed embedding must be provided.
            It is not possible to provide both at once.
        distance_metric: str = "COSINE",
            The distance to use for the loss function.
            Supported methods are L1, L2 and COSINE.
        node_types_combination: str = "Add",
            Method to combine the node embedding with the node type embedding.
            The supported methods are "Add" and "Concatenate".
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        model_name: str = "TransE",
            Name of the model.
        optimizer: Union[str, Optimizer] = "nadam",
            The optimizer to be used during the training of the model.
        support_mirrored_strategy: bool = False,
            Whether to patch support for mirrored strategy.
            At the time of writing, TensorFlow's MirrorStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        super().__init__(
            graph=graph,
            use_node_types=use_node_types,
            node_types_combination=node_types_combination,
            use_edge_types=True,
            node_embedding_size=embedding_size,
            node_type_embedding_size=embedding_size,
            edge_type_embedding_size=embedding_size,
            distance_metric=distance_metric,
            embedding=embedding,
            extra_features=extra_features,
            model_name=model_name,
            optimizer=optimizer,
            support_mirrored_strategy=support_mirrored_strategy,
            use_gradient_centralization=use_gradient_centralization
        )

    def _build_output(
        self,
        source_node_embedding: tf.Tensor,
        destination_node_embedding: tf.Tensor,
        edge_type_embedding: Optional[tf.Tensor] = None,
        edge_types_input: Optional[tf.Tensor] = None,
    ):
        """Return output of the model."""
        return super()._build_output(
            source_node_embedding + edge_type_embedding,
            destination_node_embedding,
            edge_type_embedding,
            edge_types_input
        )

Siamese network for node embedding, optionally including node types and edge types.

#   TransE( graph: Graph, embedding_size: int = 100, distance_metric: str = 'COSINE', use_node_types: Union[bool, str] = 'auto', node_types_combination: str = 'Add', embedding: Union[numpy.ndarray, pandas.core.frame.DataFrame] = None, extra_features: Union[numpy.ndarray, pandas.core.frame.DataFrame] = None, model_name: str = 'TransE', optimizer: Union[str, keras.optimizer_v2.optimizer_v2.OptimizerV2] = None, support_mirrored_strategy: bool = False, use_gradient_centralization: str = 'auto' )
View Source
    def __init__(
        self,
        graph: Graph,
        embedding_size: int = 100,
        distance_metric: str = "COSINE",
        use_node_types: Union[bool, str] = "auto",
        node_types_combination: str = "Add",
        embedding: Union[np.ndarray, pd.DataFrame] = None,
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
        model_name: str = "TransE",
        optimizer: Union[str, Optimizer] = None,
        support_mirrored_strategy: bool = False,
        use_gradient_centralization: str = "auto"
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        vocabulary_size: int = None,
            Number of terms to embed.
            In a graph this is the number of nodes, while in a text it is the
            number of unique words.
            If None, the seed embedding must be provided.
            It is not possible to provide both at once.
        embedding_size: int = 100,
            Dimension of the embedding.
            If None, the seed embedding must be provided.
            It is not possible to provide both at once.
        distance_metric: str = "COSINE",
            The distance to use for the loss function.
            Supported methods are L1, L2 and COSINE.
        node_types_combination: str = "Add",
            Method to combine the node embedding with the node type embedding.
            The supported methods are "Add" and "Concatenate".
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        model_name: str = "TransE",
            Name of the model.
        optimizer: Union[str, Optimizer] = "nadam",
            The optimizer to be used during the training of the model.
        support_mirrored_strategy: bool = False,
            Whether to patch support for mirrored strategy.
            At the time of writing, TensorFlow's MirrorStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        super().__init__(
            graph=graph,
            use_node_types=use_node_types,
            node_types_combination=node_types_combination,
            use_edge_types=True,
            node_embedding_size=embedding_size,
            node_type_embedding_size=embedding_size,
            edge_type_embedding_size=embedding_size,
            distance_metric=distance_metric,
            embedding=embedding,
            extra_features=extra_features,
            model_name=model_name,
            optimizer=optimizer,
            support_mirrored_strategy=support_mirrored_strategy,
            use_gradient_centralization=use_gradient_centralization
        )

Create new sequence Embedder model.

Parameters
  • vocabulary_size (int = None,): Number of terms to embed. In a graph this is the number of nodes, while in a text it is the number of unique words. If None, the seed embedding must be provided. It is not possible to provide both at once.
  • embedding_size (int = 100,): Dimension of the embedding. If None, the seed embedding must be provided. It is not possible to provide both at once.
  • distance_metric (str = "COSINE",): The distance to use for the loss function. Supported methods are L1, L2 and COSINE.
  • node_types_combination (str = "Add",): Method to combine the node embedding with the node type embedding. The supported methods are "Add" and "Concatenate".
  • embedding (Union[np.ndarray, pd.DataFrame] = None,): The seed embedding to be used. Note that it is not possible to provide at once both the embedding and either the vocabulary size or the embedding size.
  • extra_features (Union[np.ndarray, pd.DataFrame] = None,): Optional extra features to be used during the computation of the embedding. The features must be available for all the elements considered for the embedding.
  • model_name (str = "TransE",): Name of the model.
  • optimizer (Union[str, Optimizer] = "nadam",): The optimizer to be used during the training of the model.
  • support_mirrored_strategy (bool = False,): Whether to patch support for mirrored strategy. At the time of writing, TensorFlow's MirrorStrategy does not support input values different from floats, therefore to support it we need to convert the unsigned int 32 values that represent the indices of the embedding layers we receive from Ensmallen to floats. This will generally slow down performance, but in the context of exploiting multiple GPUs it may be unnoticeable.
  • use_gradient_centralization (bool = True,): Whether to wrap the provided optimizer into a normalized one that centralizes the gradient. It is automatically enabled if the current version of TensorFlow supports gradient transformers. More detail here: https://arxiv.org/pdf/2004.01461.pdf
Inherited Members
Siamese
NODE_TYPE_EMBEDDING_LAYER_NAME
EDGE_TYPE_EMBEDDING_LAYER_NAME
get_embedding_dataframe
embedding
fit
grape.embiggen.embedders.embedder.Embedder
TERMS_EMBEDDING_LAYER_NAME
trainable
summary
get_layer_weights
save_embedding
name
save_weights
load_weights
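To ground the constructor documentation, a hedged end-to-end sketch follows: graph loading is left as an assumption (any Ensmallen Graph with edge types works), the fit parameters follow the Siamese fit signature shown for TransH and TransR below, and the `embedding` property comes from the inherited members listed above.

from grape.embiggen import TransE

# Assumption: `graph` is an Ensmallen Graph with edge types, e.g. obtained
# through grape's graph retrieval utilities or built from an edge list.
model = TransE(
    graph,
    embedding_size=100,
    distance_metric="COSINE",
)
# Fit parameters mirror the Siamese fit signature documented for TransH/TransR below.
history = model.fit(epochs=100, batch_size=2**15, verbose=1)
node_embedding = model.embedding  # inherited property listed above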
#   class TransH(grape.embiggen.TransE):
View Source
class TransH(TransE):
    """Siamese network for node-embedding including optionally node types and edge types."""

    def __init__(
        self,
        graph: Graph,
        embedding_size: int = 100,
        distance_metric: str = "COSINE",
        embedding: Union[np.ndarray, pd.DataFrame] = None,
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
        model_name: str = "TransH",
        optimizer: Union[str, Optimizer] = None,
        support_mirrored_strategy: bool = False,
        use_gradient_centralization: str = "auto"
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        vocabulary_size: int = None,
            Number of terms to embed.
            In a graph this is the number of nodes, while in a text it is the
            number of unique words.
            If None, the seed embedding must be provided.
            It is not possible to provide both at once.
        embedding_size: int = 100,
            Dimension of the embedding.
            If None, the seed embedding must be provided.
            It is not possible to provide both at once.
        distance_metric: str = "COSINE",
            The distance to use for the loss function.
            Supported methods are L1, L2 and COSINE.
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        model_name: str = "TransH",
            Name of the model.
        optimizer: Union[str, Optimizer] = "nadam",
            The optimizer to be used during the training of the model.
        support_mirrored_strategy: bool = False,
            Whether to patch support for mirrored strategy.
            At the time of writing, TensorFlow's MirrorStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        super().__init__(
            graph=graph,
            embedding_size=embedding_size,
            distance_metric=distance_metric,
            embedding=embedding,
            extra_features=extra_features,
            model_name=model_name,
            optimizer=optimizer,
            support_mirrored_strategy=support_mirrored_strategy,
            use_gradient_centralization=use_gradient_centralization
        )

    def _build_output(
        self,
        source_node_embedding: tf.Tensor,
        destination_node_embedding: tf.Tensor,
        edge_type_embedding: Optional[tf.Tensor] = None,
        edge_types_input: Optional[tf.Tensor] = None,
    ):
        """Return output of the model."""
        normal_edge_type_embedding = Embedding(
            input_dim=self._edge_types_number,
            output_dim=self._edge_type_embedding_size,
            input_length=1,
            name="normal_edge_type_embedding_layer",
        )(edge_types_input)

        normal_edge_type_embedding = UnitNorm(
            axis=-1
        )(normal_edge_type_embedding)

        source_node_embedding -= tf.transpose(
            normal_edge_type_embedding,
            perm=[0, 2, 1]
        ) * source_node_embedding * normal_edge_type_embedding

        destination_node_embedding -= tf.transpose(
            normal_edge_type_embedding,
            perm=[0, 2, 1]
        ) * destination_node_embedding * normal_edge_type_embedding

        return super()._build_output(
            source_node_embedding,
            destination_node_embedding,
            edge_type_embedding,
            edge_types_input
        )

    def fit(
        self,
        batch_size: int = 2**15,
        negative_samples_rate: float = 0.5,
        avoid_false_negatives: bool = False,
        graph_to_avoid: Graph = None,
        batches_per_epoch: Union[int, str] = "auto",
        elapsed_epochs: int = 0,
        epochs: int = 1000,
        early_stopping_monitor: str = "loss",
        early_stopping_min_delta: float = 0.01,
        early_stopping_patience: int = 5,
        early_stopping_mode: str = "min",
        reduce_lr_monitor: str = "loss",
        reduce_lr_min_delta: float = 0.01,
        reduce_lr_patience: int = 2,
        reduce_lr_mode: str = "min",
        reduce_lr_factor: float = 0.9,
        verbose: int = 2,
        **kwargs: Dict
    ) -> pd.DataFrame:
        """Return pandas dataframe with training history.

        Parameters
        -----------------------
        graph: Graph,
            Graph to embed.
        epochs: int = 100,
            Epochs to train the model for.
        early_stopping_monitor: str = "loss",
            Metric to monitor for early stopping.
        early_stopping_min_delta: float = 0.1,
            Minimum delta of metric to stop the training.
        early_stopping_patience: int = 5,
            Number of epochs to wait for when the given minimum delta is not
            achieved after which trigger early stopping.
        early_stopping_mode: str = "min",
            Direction of the variation of the monitored metric for early stopping.
        reduce_lr_monitor: str = "loss",
            Metric to monitor for reducing learning rate.
        reduce_lr_min_delta: float = 1,
            Minimum delta of metric to reduce learning rate.
        reduce_lr_patience: int = 3,
            Number of epochs to wait for when the given minimum delta is not
            achieved after which reducing learning rate.
        reduce_lr_mode: str = "min",
            Direction of the variation of the monitored metric for learning rate.
        reduce_lr_factor: float = 0.9,
            Factor for reduction of learning rate.
        verbose: int = 2,
            Whether to show the loading bar.
            Specifically, the options are:
            * 0 or False: No loading bar.
            * 1 or True: Showing only the loading bar for the epochs.
            * 2: Showing loading bar for both epochs and batches.
        **kwargs: Dict,
            Additional kwargs to pass to the Keras fit call.

        Returns
        -----------------------
        Dataframe with training history.
        """
        return super().fit(
            batch_size=batch_size,
            avoid_false_negatives=avoid_false_negatives,
            graph_to_avoid=graph_to_avoid,
            negative_samples_rate=negative_samples_rate,
            elapsed_epochs=elapsed_epochs,
            batches_per_epoch=batches_per_epoch,
            epochs=epochs,
            early_stopping_monitor=early_stopping_monitor,
            early_stopping_min_delta=early_stopping_min_delta,
            early_stopping_patience=early_stopping_patience,
            early_stopping_mode=early_stopping_mode,
            reduce_lr_monitor=reduce_lr_monitor,
            reduce_lr_min_delta=reduce_lr_min_delta,
            reduce_lr_patience=reduce_lr_patience,
            reduce_lr_mode=reduce_lr_mode,
            reduce_lr_factor=reduce_lr_factor,
            verbose=verbose,
            **kwargs
        )

Siamese network for node embedding, optionally including node types and edge types.

#   TransH( graph: Graph, embedding_size: int = 100, distance_metric: str = 'COSINE', embedding: Union[numpy.ndarray, pandas.core.frame.DataFrame] = None, extra_features: Union[numpy.ndarray, pandas.core.frame.DataFrame] = None, model_name: str = 'TransH', optimizer: Union[str, keras.optimizer_v2.optimizer_v2.OptimizerV2] = None, support_mirrored_strategy: bool = False, use_gradient_centralization: str = 'auto' )
View Source
    def __init__(
        self,
        graph: Graph,
        embedding_size: int = 100,
        distance_metric: str = "COSINE",
        embedding: Union[np.ndarray, pd.DataFrame] = None,
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
        model_name: str = "TransH",
        optimizer: Union[str, Optimizer] = None,
        support_mirrored_strategy: bool = False,
        use_gradient_centralization: str = "auto"
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        vocabulary_size: int = None,
            Number of terms to embed.
            In a graph this is the number of nodes, while in a text it is the
            number of unique words.
            If None, the seed embedding must be provided.
            It is not possible to provide both at once.
        embedding_size: int = 100,
            Dimension of the embedding.
            If None, the seed embedding must be provided.
            It is not possible to provide both at once.
        distance_metric: str = "COSINE",
            The distance to use for the loss function.
            Supported methods are L1, L2 and COSINE.
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        model_name: str = "TransH",
            Name of the model.
        optimizer: Union[str, Optimizer] = "nadam",
            The optimizer to be used during the training of the model.
        support_mirrored_strategy: bool = False,
            Whether to patch support for mirrored strategy.
            At the time of writing, TensorFlow's MirrorStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        super().__init__(
            graph=graph,
            embedding_size=embedding_size,
            distance_metric=distance_metric,
            embedding=embedding,
            extra_features=extra_features,
            model_name=model_name,
            optimizer=optimizer,
            support_mirrored_strategy=support_mirrored_strategy,
            use_gradient_centralization=use_gradient_centralization
        )

Create new sequence Embedder model.

Parameters
  • vocabulary_size (int = None,): Number of terms to embed. In a graph this is the number of nodes, while in a text it is the number of unique words. If None, the seed embedding must be provided. It is not possible to provide both at once.
  • embedding_size (int = 100,): Dimension of the embedding. If None, the seed embedding must be provided. It is not possible to provide both at once.
  • distance_metric (str = "COSINE",): The distance to use for the loss function. Supported methods are L1, L2 and COSINE.
  • embedding (Union[np.ndarray, pd.DataFrame] = None,): The seed embedding to be used. Note that it is not possible to provide at once both the embedding and either the vocabulary size or the embedding size.
  • extra_features (Union[np.ndarray, pd.DataFrame] = None,): Optional extra features to be used during the computation of the embedding. The features must be available for all the elements considered for the embedding.
  • model_name (str = "TransH",): Name of the model.
  • optimizer (Union[str, Optimizer] = "nadam",): The optimizer to be used during the training of the model.
  • support_mirrored_strategy (bool = False,): Whether to patch support for mirrored strategy. At the time of writing, TensorFlow's MirrorStrategy does not support input values different from floats, therefore to support it we need to convert the unsigned int 32 values that represent the indices of the embedding layers we receive from Ensmallen to floats. This will generally slow down performance, but in the context of exploiting multiple GPUs it may be unnoticeable.
  • use_gradient_centralization (bool = True,): Whether to wrap the provided optimizer into a normalized one that centralizes the gradient. It is automatically enabled if the current version of TensorFlow supports gradient transformers. More detail here: https://arxiv.org/pdf/2004.01461.pdf
#   def fit( self, batch_size: int = 32768, negative_samples_rate: float = 0.5, avoid_false_negatives: bool = False, graph_to_avoid: Graph = None, batches_per_epoch: Union[int, str] = 'auto', elapsed_epochs: int = 0, epochs: int = 1000, early_stopping_monitor: str = 'loss', early_stopping_min_delta: float = 0.01, early_stopping_patience: int = 5, early_stopping_mode: str = 'min', reduce_lr_monitor: str = 'loss', reduce_lr_min_delta: float = 0.01, reduce_lr_patience: int = 2, reduce_lr_mode: str = 'min', reduce_lr_factor: float = 0.9, verbose: int = 2, **kwargs: Dict ) -> pandas.core.frame.DataFrame:
View Source
    def fit(
        self,
        batch_size: int = 2**15,
        negative_samples_rate: float = 0.5,
        avoid_false_negatives: bool = False,
        graph_to_avoid: Graph = None,
        batches_per_epoch: Union[int, str] = "auto",
        elapsed_epochs: int = 0,
        epochs: int = 1000,
        early_stopping_monitor: str = "loss",
        early_stopping_min_delta: float = 0.01,
        early_stopping_patience: int = 5,
        early_stopping_mode: str = "min",
        reduce_lr_monitor: str = "loss",
        reduce_lr_min_delta: float = 0.01,
        reduce_lr_patience: int = 2,
        reduce_lr_mode: str = "min",
        reduce_lr_factor: float = 0.9,
        verbose: int = 2,
        **kwargs: Dict
    ) -> pd.DataFrame:
        """Return pandas dataframe with training history.

        Parameters
        -----------------------
        graph: Graph,
            Graph to embed.
        epochs: int = 100,
            Epochs to train the model for.
        early_stopping_monitor: str = "loss",
            Metric to monitor for early stopping.
        early_stopping_min_delta: float = 0.1,
            Minimum delta of metric to stop the training.
        early_stopping_patience: int = 5,
            Number of epochs to wait for when the given minimum delta is not
            achieved after which trigger early stopping.
        early_stopping_mode: str = "min",
            Direction of the variation of the monitored metric for early stopping.
        reduce_lr_monitor: str = "loss",
            Metric to monitor for reducing learning rate.
        reduce_lr_min_delta: float = 1,
            Minimum delta of metric to reduce learning rate.
        reduce_lr_patience: int = 3,
            Number of epochs to wait for when the given minimum delta is not
            achieved after which reducing learning rate.
        reduce_lr_mode: str = "min",
            Direction of the variation of the monitored metric for learning rate.
        reduce_lr_factor: float = 0.9,
            Factor for reduction of learning rate.
        verbose: int = 2,
            Whether to show the loading bar.
            Specifically, the options are:
            * 0 or False: No loading bar.
            * 1 or True: Showing only the loading bar for the epochs.
            * 2: Showing loading bar for both epochs and batches.
        **kwargs: Dict,
            Additional kwargs to pass to the Keras fit call.

        Returns
        -----------------------
        Dataframe with training history.
        """
        return super().fit(
            batch_size=batch_size,
            avoid_false_negatives=avoid_false_negatives,
            graph_to_avoid=graph_to_avoid,
            negative_samples_rate=negative_samples_rate,
            elapsed_epochs=elapsed_epochs,
            batches_per_epoch=batches_per_epoch,
            epochs=epochs,
            early_stopping_monitor=early_stopping_monitor,
            early_stopping_min_delta=early_stopping_min_delta,
            early_stopping_patience=early_stopping_patience,
            early_stopping_mode=early_stopping_mode,
            reduce_lr_monitor=reduce_lr_monitor,
            reduce_lr_min_delta=reduce_lr_min_delta,
            reduce_lr_patience=reduce_lr_patience,
            reduce_lr_mode=reduce_lr_mode,
            reduce_lr_factor=reduce_lr_factor,
            verbose=verbose,
            **kwargs
        )

Return pandas dataframe with training history.

Parameters
  • graph (Graph,): Graph to embed.
  • epochs (int = 100,): Epochs to train the model for.
  • early_stopping_monitor (str = "loss",): Metric to monitor for early stopping.
  • early_stopping_min_delta (float = 0.1,): Minimum delta of metric to stop the training.
  • early_stopping_patience (int = 5,): Number of epochs to wait for when the given minimum delta is not achieved after which trigger early stopping.
  • early_stopping_mode (str = "min",): Direction of the variation of the monitored metric for early stopping.
  • reduce_lr_monitor (str = "loss",): Metric to monitor for reducing learning rate.
  • reduce_lr_min_delta (float = 1,): Minimum delta of metric to reduce learning rate.
  • reduce_lr_patience (int = 3,): Number of epochs to wait for when the given minimum delta is not achieved after which reducing learning rate.
  • reduce_lr_mode (str = "min",): Direction of the variation of the monitored metric for learning rate.
  • reduce_lr_factor (float = 0.9,): Factor for reduction of learning rate.
  • verbose (int = 2,): Whether to show the loading bar. Specifically, the options are:
    • 0 or False: No loading bar.
    • 1 or True: Showing only the loading bar for the epochs.
    • 2: Showing loading bar for both epochs and batches.
  • **kwargs (Dict,): Additional kwargs to pass to the Keras fit call.
Returns
  • Dataframe with training history.
Inherited Members
Siamese
NODE_TYPE_EMBEDDING_LAYER_NAME
EDGE_TYPE_EMBEDDING_LAYER_NAME
get_embedding_dataframe
embedding
grape.embiggen.embedders.embedder.Embedder
TERMS_EMBEDDING_LAYER_NAME
trainable
summary
get_layer_weights
save_embedding
name
save_weights
load_weights
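The _build_output shown above implements the TransH idea of projecting the source and destination node embeddings onto the hyperplane orthogonal to a per-edge-type normal vector before applying the TransE-style translation. A small NumPy sketch of that projection (illustrative only, not part of the library):

import numpy as np

def project_onto_hyperplane(node_embedding: np.ndarray, normal: np.ndarray) -> np.ndarray:
    """Return h - (w^T h) w, the TransH projection onto the hyperplane with normal w.

    This mirrors the subtraction performed in _build_output above, where the
    normal vector is kept at unit norm by the UnitNorm constraint.
    """
    normal = normal / np.linalg.norm(normal)  # enforce unit norm, as UnitNorm does
    return node_embedding - np.dot(normal, node_embedding) * normal

# Toy check: the projected embedding is orthogonal to the hyperplane normal.
h = np.array([1.0, 2.0, 3.0])
w = np.array([0.0, 1.0, 0.0])
h_perp = project_onto_hyperplane(h, w)
assert abs(float(np.dot(h_perp, w))) < 1e-12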
#   class TransR(grape.embiggen.TransE):
View Source
class TransR(TransE):
    """Siamese network for node-embedding including optionally node types and edge types."""

    def __init__(
        self,
        graph: Graph,
        embedding_size: int = 100,
        distance_metric: str = "COSINE",
        embedding: Union[np.ndarray, pd.DataFrame] = None,
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
        model_name: str = "TransR",
        optimizer: Union[str, Optimizer] = None,
        support_mirrored_strategy: bool = False,
        use_gradient_centralization: str = "auto"
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        vocabulary_size: int = None,
            Number of terms to embed.
            In a graph this is the number of nodes, while in a text it is the
            number of unique words.
            If None, the seed embedding must be provided.
            It is not possible to provide both at once.
        embedding_size: int = 100,
            Dimension of the embedding.
            If None, the seed embedding must be provided.
            It is not possible to provide both at once.
        distance_metric: str = "COSINE",
            The distance to use for the loss function.
            Supported methods are L1, L2 and COSINE.
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        model_name: str = "TransR",
            Name of the model.
        optimizer: Union[str, Optimizer] = "nadam",
            The optimizer to be used during the training of the model.
        support_mirrored_strategy: bool = False,
            Whether to patch support for mirrored strategy.
            At the time of writing, TensorFlow's MirrorStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        super().__init__(
            graph=graph,
            embedding_size=embedding_size,
            distance_metric=distance_metric,
            embedding=embedding,
            extra_features=extra_features,
            model_name=model_name,
            optimizer=optimizer,
            support_mirrored_strategy=support_mirrored_strategy,
            use_gradient_centralization=use_gradient_centralization
        )

    def _build_output(
        self,
        source_node_embedding: tf.Tensor,
        destination_node_embedding: tf.Tensor,
        edge_type_embedding: Optional[tf.Tensor] = None,
        edge_types_input: Optional[tf.Tensor] = None,
    ):
        """Return output of the model."""
        normal_edge_type_embedding = Embedding(
            input_dim=self._edge_types_number,
            output_dim=self._edge_type_embedding_size*self._embedding_size,
            input_length=1,
            name="normal_edge_type_embedding_layer",
        )(edge_types_input)

        normal_edge_type_embedding_matrix = Reshape((
            self._edge_type_embedding_size,
            self._embedding_size
        ))(normal_edge_type_embedding)

        source_node_embedding = K.l2_normalize(
            normal_edge_type_embedding_matrix * source_node_embedding,
            axis=-1
        )
        destination_node_embedding = K.l2_normalize(
            normal_edge_type_embedding_matrix * destination_node_embedding,
            axis=-1
        )

        return super()._build_output(
            source_node_embedding,
            destination_node_embedding,
            edge_type_embedding,
            edge_types_input
        )

    def fit(
        self,
        batch_size: int = 2**15,
        negative_samples_rate: float = 0.5,
        avoid_false_negatives: bool = False,
        graph_to_avoid: Graph = None,
        batches_per_epoch: Union[int, str] = "auto",
        elapsed_epochs: int = 0,
        epochs: int = 1000,
        early_stopping_monitor: str = "loss",
        early_stopping_min_delta: float = 0.01,
        early_stopping_patience: int = 5,
        early_stopping_mode: str = "min",
        reduce_lr_monitor: str = "loss",
        reduce_lr_min_delta: float = 0.01,
        reduce_lr_patience: int = 2,
        reduce_lr_mode: str = "min",
        reduce_lr_factor: float = 0.9,
        verbose: int = 2,
        **kwargs: Dict
    ) -> pd.DataFrame:
        """Return pandas dataframe with training history.

        Parameters
        -----------------------
        graph: Graph,
            Graph to embed.
        epochs: int = 100,
            Epochs to train the model for.
        early_stopping_monitor: str = "loss",
            Metric to monitor for early stopping.
        early_stopping_min_delta: float = 0.1,
            Minimum delta of metric to stop the training.
        early_stopping_patience: int = 5,
            Number of epochs to wait for when the given minimum delta is not
            achieved after which trigger early stopping.
        early_stopping_mode: str = "min",
            Direction of the variation of the monitored metric for early stopping.
        reduce_lr_monitor: str = "loss",
            Metric to monitor for reducing learning rate.
        reduce_lr_min_delta: float = 1,
            Minimum delta of metric to reduce learning rate.
        reduce_lr_patience: int = 3,
            Number of epochs to wait for when the given minimum delta is not
            achieved after which reducing learning rate.
        reduce_lr_mode: str = "min",
            Direction of the variation of the monitored metric for learning rate.
        reduce_lr_factor: float = 0.9,
            Factor for reduction of learning rate.
        verbose: int = 2,
            Whether to show the loading bar.
            Specifically, the options are:
            * 0 or False: No loading bar.
            * 1 or True: Showing only the loading bar for the epochs.
            * 2: Showing loading bar for both epochs and batches.
        **kwargs: Dict,
            Additional kwargs to pass to the Keras fit call.

        Returns
        -----------------------
        Dataframe with training history.
        """
        return super().fit(
            batch_size=batch_size,
            avoid_false_negatives=avoid_false_negatives,
            graph_to_avoid=graph_to_avoid,
            negative_samples_rate=negative_samples_rate,
            elapsed_epochs=elapsed_epochs,
            batches_per_epoch=batches_per_epoch,
            epochs=epochs,
            early_stopping_monitor=early_stopping_monitor,
            early_stopping_min_delta=early_stopping_min_delta,
            early_stopping_patience=early_stopping_patience,
            early_stopping_mode=early_stopping_mode,
            reduce_lr_monitor=reduce_lr_monitor,
            reduce_lr_min_delta=reduce_lr_min_delta,
            reduce_lr_patience=reduce_lr_patience,
            reduce_lr_mode=reduce_lr_mode,
            reduce_lr_factor=reduce_lr_factor,
            verbose=verbose,
            **kwargs
        )

Siamese network for node embedding, optionally including node types and edge types.

#   TransR( graph: Graph, embedding_size: int = 100, distance_metric: str = 'COSINE', embedding: Union[numpy.ndarray, pandas.core.frame.DataFrame] = None, extra_features: Union[numpy.ndarray, pandas.core.frame.DataFrame] = None, model_name: str = 'TransR', optimizer: Union[str, keras.optimizer_v2.optimizer_v2.OptimizerV2] = None, support_mirrored_strategy: bool = False, use_gradient_centralization: str = 'auto' )
View Source
    def __init__(
        self,
        graph: Graph,
        embedding_size: int = 100,
        distance_metric: str = "COSINE",
        embedding: Union[np.ndarray, pd.DataFrame] = None,
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
        model_name: str = "TransR",
        optimizer: Union[str, Optimizer] = None,
        support_mirrored_strategy: bool = False,
        use_gradient_centralization: str = "auto"
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        vocabulary_size: int = None,
            Number of terms to embed.
            In a graph this is the number of nodes, while in a text it is the
            number of unique words.
            If None, the seed embedding must be provided.
            It is not possible to provide both at once.
        embedding_size: int = 100,
            Dimension of the embedding.
            If None, the seed embedding must be provided.
            It is not possible to provide both at once.
        distance_metric: str = "COSINE",
            The distance to use for the loss function.
            Supported methods are L1, L2 and COSINE.
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        model_name: str = "TransR",
            Name of the model.
        optimizer: Union[str, Optimizer] = "nadam",
            The optimizer to be used during the training of the model.
        support_mirrored_strategy: bool = False,
            Whether to patch support for mirrored strategy.
            At the time of writing, TensorFlow's MirrorStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        use_gradient_centralization: bool = True,
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        super().__init__(
            graph=graph,
            embedding_size=embedding_size,
            distance_metric=distance_metric,
            embedding=embedding,
            extra_features=extra_features,
            model_name=model_name,
            optimizer=optimizer,
            support_mirrored_strategy=support_mirrored_strategy,
            use_gradient_centralization=use_gradient_centralization
        )

Create new sequence Embedder model.

Parameters
  • vocabulary_size (int = None,): Number of terms to embed. In a graph this is the number of nodes, while in a text it is the number of unique words. If None, the seed embedding must be provided. It is not possible to provide both at once.
  • embedding_size (int = 100,): Dimension of the embedding. If None, the seed embedding must be provided. It is not possible to provide both at once.
  • distance_metric (str = "COSINE",): The distance to use for the loss function. Supported methods are L1, L2 and COSINE.
  • embedding (Union[np.ndarray, pd.DataFrame] = None,): The seed embedding to be used. Note that it is not possible to provide at once both the embedding and either the vocabulary size or the embedding size.
  • extra_features (Union[np.ndarray, pd.DataFrame] = None,): Optional extra features to be used during the computation of the embedding. The features must be available for all the elements considered for the embedding.
  • model_name (str = "TransR",): Name of the model.
  • optimizer (Union[str, Optimizer] = "nadam",): The optimizer to be used during the training of the model.
  • support_mirrored_strategy (bool = False,): Whether to patch support for mirrored strategy. At the time of writing, TensorFlow's MirrorStrategy does not support input values different from floats, therefore to support it we need to convert the unsigned int 32 values that represent the indices of the embedding layers we receive from Ensmallen to floats. This will generally slow down performance, but in the context of exploiting multiple GPUs it may be unnoticeable.
  • use_gradient_centralization (bool = True,): Whether to wrap the provided optimizer into a normalized one that centralizes the gradient. It is automatically enabled if the current version of TensorFlow supports gradient transformers. More detail here: https://arxiv.org/pdf/2004.01461.pdf
#   def fit( self, batch_size: int = 32768, negative_samples_rate: float = 0.5, avoid_false_negatives: bool = False, graph_to_avoid: Graph = None, batches_per_epoch: Union[int, str] = 'auto', elapsed_epochs: int = 0, epochs: int = 1000, early_stopping_monitor: str = 'loss', early_stopping_min_delta: float = 0.01, early_stopping_patience: int = 5, early_stopping_mode: str = 'min', reduce_lr_monitor: str = 'loss', reduce_lr_min_delta: float = 0.01, reduce_lr_patience: int = 2, reduce_lr_mode: str = 'min', reduce_lr_factor: float = 0.9, verbose: int = 2, **kwargs: Dict ) -> pandas.core.frame.DataFrame:
View Source
    def fit(
        self,
        batch_size: int = 2**15,
        negative_samples_rate: float = 0.5,
        avoid_false_negatives: bool = False,
        graph_to_avoid: Graph = None,
        batches_per_epoch: Union[int, str] = "auto",
        elapsed_epochs: int = 0,
        epochs: int = 1000,
        early_stopping_monitor: str = "loss",
        early_stopping_min_delta: float = 0.01,
        early_stopping_patience: int = 5,
        early_stopping_mode: str = "min",
        reduce_lr_monitor: str = "loss",
        reduce_lr_min_delta: float = 0.01,
        reduce_lr_patience: int = 2,
        reduce_lr_mode: str = "min",
        reduce_lr_factor: float = 0.9,
        verbose: int = 2,
        **kwargs: Dict
    ) -> pd.DataFrame:
        """Return pandas dataframe with training history.

        Parameters
        -----------------------
        graph: Graph,
            Graph to embed.
        epochs: int = 100,
            Epochs to train the model for.
        early_stopping_monitor: str = "loss",
            Metric to monitor for early stopping.
        early_stopping_min_delta: float = 0.1,
            Minimum delta of metric to stop the training.
        early_stopping_patience: int = 5,
            Number of epochs to wait for when the given minimum delta is not
            achieved after which trigger early stopping.
        early_stopping_mode: str = "min",
            Direction of the variation of the monitored metric for early stopping.
        reduce_lr_monitor: str = "loss",
            Metric to monitor for reducing learning rate.
        reduce_lr_min_delta: float = 1,
            Minimum delta of metric to reduce learning rate.
        reduce_lr_patience: int = 3,
            Number of epochs to wait for when the given minimum delta is not
            achieved after which reducing learning rate.
        reduce_lr_mode: str = "min",
            Direction of the variation of the monitored metric for learning rate.
        reduce_lr_factor: float = 0.9,
            Factor for reduction of learning rate.
        verbose: int = 2,
            Whether to show the loading bar.
            Specifically, the options are:
            * 0 or False: No loading bar.
            * 1 or True: Showing only the loading bar for the epochs.
            * 2: Showing loading bar for both epochs and batches.
        **kwargs: Dict,
            Additional kwargs to pass to the Keras fit call.

        Returns
        -----------------------
        Dataframe with training history.
        """
        return super().fit(
            batch_size=batch_size,
            avoid_false_negatives=avoid_false_negatives,
            graph_to_avoid=graph_to_avoid,
            negative_samples_rate=negative_samples_rate,
            elapsed_epochs=elapsed_epochs,
            batches_per_epoch=batches_per_epoch,
            epochs=epochs,
            early_stopping_monitor=early_stopping_monitor,
            early_stopping_min_delta=early_stopping_min_delta,
            early_stopping_patience=early_stopping_patience,
            early_stopping_mode=early_stopping_mode,
            reduce_lr_monitor=reduce_lr_monitor,
            reduce_lr_min_delta=reduce_lr_min_delta,
            reduce_lr_patience=reduce_lr_patience,
            reduce_lr_mode=reduce_lr_mode,
            reduce_lr_factor=reduce_lr_factor,
            verbose=verbose,
            **kwargs
        )

Return pandas dataframe with training history.

Parameters
  • batch_size (int,): Number of edge samples per training batch.
  • negative_samples_rate (float = 0.5,): Rate of negative (non-existent) edges sampled in each batch.
  • avoid_false_negatives (bool = False,): Whether to filter out false negatives, that is sampled negative edges that actually exist in the graph. This check slows down the batch generation.
  • graph_to_avoid (Graph = None,): Optional graph whose edges must never be sampled as negatives, for instance a validation graph.
  • batches_per_epoch (Union[int, str] = "auto",): Number of batches per epoch. With "auto", it is derived from the size of the graph.
  • elapsed_epochs (int = 0,): Number of epochs already elapsed, used when resuming training.
  • epochs (int = 1000,): Epochs to train the model for.
  • early_stopping_monitor (str = "loss",): Metric to monitor for early stopping.
  • early_stopping_min_delta (float = 0.01,): Minimum delta of metric to stop the training.
  • early_stopping_patience (int = 5,): Number of epochs to wait, once the given minimum delta is no longer achieved, before triggering early stopping.
  • early_stopping_mode (str = "min",): Direction of the variation of the monitored metric for early stopping.
  • reduce_lr_monitor (str = "loss",): Metric to monitor for reducing learning rate.
  • reduce_lr_min_delta (float = 0.01,): Minimum delta of metric to reduce learning rate.
  • reduce_lr_patience (int = 2,): Number of epochs to wait, once the given minimum delta is no longer achieved, before reducing the learning rate.
  • reduce_lr_mode (str = "min",): Direction of the variation of the monitored metric for learning rate.
  • reduce_lr_factor (float = 0.9,): Factor for reduction of learning rate.
  • verbose (int = 2,): Whether to show the loading bar. Specifically, the options are:
    • 0 or False: No loading bar.
    • 1 or True: Show only the loading bar for the epochs.
    • 2: Show loading bars for both epochs and batches.
  • **kwargs (Dict,): Additional kwargs to pass to the Keras fit call.
Returns
  • Dataframe with training history.
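A minimal usage sketch for this fit method. The names below are hypothetical: `model` is assumed to be an already-constructed instance of this embedder.

    # Train the model and inspect the returned training history DataFrame.
    history = model.fit(
        epochs=100,
        early_stopping_patience=5,
        verbose=1,
    )
    print(history["loss"].min())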
Inherited Members
Siamese
NODE_TYPE_EMBEDDING_LAYER_NAME
EDGE_TYPE_EMBEDDING_LAYER_NAME
get_embedding_dataframe
embedding
grape.embiggen.embedders.embedder.Embedder
TERMS_EMBEDDING_LAYER_NAME
trainable
summary
get_layer_weights
save_embedding
name
save_weights
load_weights
#   class SimplE(grape.embiggen.Siamese):
View Source
class SimplE(Siamese):
    """Siamese network for node-embedding including optionally node types and edge types."""

    def __init__(
        self,
        graph: Graph,
        embedding_size: int = 100,
        embedding: Union[np.ndarray, pd.DataFrame] = None,
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
        model_name: str = "SimplE",
        optimizer: Union[str, Optimizer] = None,
        support_mirrored_strategy: bool = False,
        use_gradient_centralization: str = "auto"
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        graph: Graph,
            The graph whose nodes and edge types are to be embedded.
        embedding_size: int = 100,
            Dimension of the embedding.
            If None, the seed embedding must be provided.
            It is not possible to provide both at once.
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        model_name: str = "SimplE",
            Name of the model.
        optimizer: Union[str, Optimizer] = None,
            The optimizer to be used during the training of the model.
        support_mirrored_strategy: bool = False,
            Whether to patch support for MirroredStrategy.
            At the time of writing, TensorFlow's MirroredStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        use_gradient_centralization: Union[bool, str] = "auto",
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        super().__init__(
            graph=graph,
            use_node_types=False,
            use_edge_types=True,
            node_embedding_size=embedding_size,
            edge_type_embedding_size=embedding_size,
            embedding=embedding,
            extra_features=extra_features,
            model_name=model_name,
            optimizer=optimizer,
            support_mirrored_strategy=support_mirrored_strategy,
            use_gradient_centralization=use_gradient_centralization
        )

    def _build_output(
        self,
        source_node_embedding: tf.Tensor,
        destination_node_embedding: tf.Tensor,
        edge_type_embedding: Optional[tf.Tensor] = None,
        edge_types_input: Optional[tf.Tensor] = None,
    ):
        """Return output of the model."""
        reverse_edge_type_embedding = Embedding(
            input_dim=self._edge_types_number,
            output_dim=self._edge_type_embedding_size,
            input_length=1,
            name="reverse_edge_type_embedding_layer",
        )(edge_types_input)
        reverse_edge_type_embedding = UnitNorm(
            axis=-1
        )(reverse_edge_type_embedding)
        return K.sum(
            source_node_embedding * edge_type_embedding * destination_node_embedding,
            axis=-1
        ) + K.sum(
            destination_node_embedding * reverse_edge_type_embedding * source_node_embedding,
            axis=-1
        )

SimplE model for node embedding, built as a Siamese network that uses edge types.
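A small NumPy sketch of the score computed by the `_build_output` method shown above (illustrative only; in the actual model these are batched, unit-normalized Keras tensors):

    import numpy as np

    s = np.random.rand(8)      # source node embedding
    d = np.random.rand(8)      # destination node embedding
    r = np.random.rand(8)      # edge type embedding
    r_rev = np.random.rand(8)  # reverse edge type embedding

    # SimplE-style score: forward triple plus reverse triple.
    score = np.sum(s * r * d) + np.sum(d * r_rev * s)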

#   SimplE( graph: Graph, embedding_size: int = 100, embedding: Union[numpy.ndarray, pandas.core.frame.DataFrame] = None, extra_features: Union[numpy.ndarray, pandas.core.frame.DataFrame] = None, model_name: str = 'SimplE', optimizer: Union[str, keras.optimizer_v2.optimizer_v2.OptimizerV2] = None, support_mirrored_strategy: bool = False, use_gradient_centralization: str = 'auto' )
View Source
    def __init__(
        self,
        graph: Graph,
        embedding_size: int = 100,
        embedding: Union[np.ndarray, pd.DataFrame] = None,
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
        model_name: str = "SimplE",
        optimizer: Union[str, Optimizer] = None,
        support_mirrored_strategy: bool = False,
        use_gradient_centralization: str = "auto"
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        graph: Graph,
            The graph whose nodes and edge types are to be embedded.
        embedding_size: int = 100,
            Dimension of the embedding.
            If None, the seed embedding must be provided.
            It is not possible to provide both at once.
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        model_name: str = "SimplE",
            Name of the model.
        optimizer: Union[str, Optimizer] = None,
            The optimizer to be used during the training of the model.
        support_mirrored_strategy: bool = False,
            Whether to patch support for MirroredStrategy.
            At the time of writing, TensorFlow's MirroredStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        use_gradient_centralization: Union[bool, str] = "auto",
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        super().__init__(
            graph=graph,
            use_node_types=False,
            use_edge_types=True,
            node_embedding_size=embedding_size,
            edge_type_embedding_size=embedding_size,
            embedding=embedding,
            extra_features=extra_features,
            model_name=model_name,
            optimizer=optimizer,
            support_mirrored_strategy=support_mirrored_strategy,
            use_gradient_centralization=use_gradient_centralization
        )

Create new SimplE model.

Parameters
  • graph (Graph,): The graph whose nodes and edge types are to be embedded.
  • embedding_size (int = 100,): Dimension of the embedding. If None, the seed embedding must be provided. It is not possible to provide both at once.
  • embedding (Union[np.ndarray, pd.DataFrame] = None,): The seed embedding to be used. Note that it is not possible to provide at once both the embedding and either the vocabulary size or the embedding size.
  • extra_features (Union[np.ndarray, pd.DataFrame] = None,): Optional extra features to be used during the computation of the embedding. The features must be available for all the elements considered for the embedding.
  • model_name (str = "SimplE",): Name of the model.
  • optimizer (Union[str, Optimizer] = None,): The optimizer to be used during the training of the model.
  • support_mirrored_strategy (bool = False,): Whether to patch support for MirroredStrategy. At the time of writing, TensorFlow's MirroredStrategy does not support input values different from floats, therefore to support it we need to convert the unsigned int 32 values that represent the indices of the embedding layers we receive from Ensmallen to floats. This will generally slow down performance, but in the context of exploiting multiple GPUs it may be unnoticeable.
  • use_gradient_centralization (Union[bool, str] = "auto",): Whether to wrap the provided optimizer into a normalized one that centralizes the gradient. It is automatically enabled if the current version of TensorFlow supports gradient transformers. More detail here: https://arxiv.org/pdf/2004.01461.pdf
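A hedged construction sketch (hypothetical setup: `graph` is assumed to be an Ensmallen Graph that already provides edge types):

    from grape.embiggen import SimplE

    model = SimplE(graph=graph, embedding_size=64)
    history = model.fit(epochs=200)

    # SimplE uses edge types but no node types, so the returned list holds
    # the node embedding followed by the edge type embedding.
    node_embedding, edge_type_embedding = model.get_embedding_dataframe()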
Inherited Members
Siamese
NODE_TYPE_EMBEDDING_LAYER_NAME
EDGE_TYPE_EMBEDDING_LAYER_NAME
get_embedding_dataframe
embedding
fit
grape.embiggen.embedders.embedder.Embedder
TERMS_EMBEDDING_LAYER_NAME
trainable
summary
get_layer_weights
save_embedding
name
save_weights
load_weights
#   class Siamese(grape.embiggen.embedders.embedder.Embedder):
View Source
class Siamese(Embedder):
    """Siamese network for node-embedding including optionally node types and edge types."""

    NODE_TYPE_EMBEDDING_LAYER_NAME = "node_type_embedding_layer"
    EDGE_TYPE_EMBEDDING_LAYER_NAME = "edge_type_embedding_layer"

    def __init__(
        self,
        graph: Graph,
        use_node_types: Union[bool, str] = "auto",
        node_types_combination: str = "Add",
        use_edge_types: Union[bool, str] = "auto",
        node_embedding_size: int = 100,
        node_type_embedding_size: int = 100,
        edge_type_embedding_size: int = 100,
        distance_metric: str = "COSINE",
        relu_bias: float = 1.0,
        embedding: Optional[Union[np.ndarray, pd.DataFrame]] = None,
        extra_features: Optional[Union[np.ndarray, pd.DataFrame]] = None,
        model_name: str = "Siamese",
        optimizer: Union[str, Optimizer] = None,
        support_mirrored_strategy: bool = False,
        use_gradient_centralization: str = "auto"
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        graph: Graph,
            The graph whose nodes are to be embedded.
        use_node_types: Union[bool, str] = "auto",
            Whether to use node types.
            By default, it will automatically use node types if the graph
            contains node types and does not contain any unknown node type.
        node_types_combination: str = "Add",
            Method to combine the node embedding with the node type embedding.
            The supported methods are "Add" and "Concatenate".
        use_edge_types: Union[bool, str] = "auto",
            Whether to use edge types.
            By default, it will automatically use edge types if the graph
            contains edge types and does not contain any unknown edge type.
        node_embedding_size: int = 100,
            Dimension of the embedding.
            If None, the seed embedding must be provided.
            It is not possible to provide both at once.
        node_type_embedding_size: int = 100,
            Dimension of the embedding for the node types.
        edge_type_embedding_size: int = 100,
            Dimension of the embedding for the edge types.
        distance_metric: str = "COSINE",
            The distance to use for the loss function.
            Supported methods are L1, L2 and COSINE.
        relu_bias: float = 1.0,
            The bias to use for the ReLU.
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        model_name: str = "Siamese",
            Name of the model.
        optimizer: Union[str, Optimizer] = None,
            The optimizer to be used during the training of the model.
        support_mirrored_strategy: bool = False,
            Whether to patch support for MirroredStrategy.
            At the time of writing, TensorFlow's MirroredStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        use_gradient_centralization: Union[bool, str] = "auto",
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        self._model_name = model_name
        if graph.has_disconnected_nodes():
            warnings.warn(
                "The graph contains disconnected nodes: these nodes will "
                "not be embedded in a semantically sensible way, but "
                "will only obtain a random node embedding vector which is "
                "far from all other nodes."
            )
        if use_node_types == "auto":
            use_node_types = graph.has_node_types() and not graph.has_unknown_node_types()
        if use_node_types:
            if not graph.has_node_types():
                raise ValueError(
                    "Node types are to be used to embed the given graph, "
                    "but the graph does not have node types"
                )
            if graph.has_unknown_node_types():
                raise ValueError(
                    "Node types are to be used to embed the given graph, "
                    "but the graph contains unknown node types and this "
                    "type of model is not designed in order to handle "
                    "unknown node types."
                )
            if graph.has_singleton_node_types():
                warnings.warn(
                    "The node types will be used in order to compute the node "
                    "embedding, but there are some singleton node types: these "
                    "node types will not capture any characteristic that is not "
                    "already captured by the node embedding, and may be an error "
                    "in the pipeline you have used to create this graph."
                )

            if graph.has_homogeneous_node_types():
                warnings.warn(
                    "The graph contains exclusively nodes with a homogenous "
                    "node type!"
                )

            self._multilabel_node_types = graph.has_multilabel_node_types()
            self._max_node_types = graph.get_maximum_multilabel_count()
            self._node_types_number = graph.get_node_types_number()
        if use_edge_types == "auto":
            use_edge_types = graph.has_edge_types() and not graph.has_unknown_edge_types()
        if use_edge_types:
            if not graph.has_edge_types():
                raise ValueError(
                    "Edge types are to be used to embed the given graph, "
                    "but the graph does not have edge types"
                )
            if graph.has_unknown_edge_types():
                raise ValueError(
                    "Edge types are to be used to embed the given graph, "
                    "but the graph contains unknown edge types and this "
                    "type of model is not designed in order to handle "
                    "unknown edge types."
                )
            if graph.has_singleton_edge_types():
                warnings.warn(
                    "The edge types will be used in order to compute the edge "
                    "embedding, but there are some singleton edge types: these "
                    "edge types will not capture any characteristic that is not "
                    "already captured by the edge embedding, and may be an error "
                    "in the pipeline you have used to create this graph."
                )

            if graph.has_homogeneous_edge_types():
                warnings.warn(
                    "The graph contains exclusively edges with a homogenous "
                    "edge type!"
                )

            self._edge_types_number = graph.get_edge_types_number()

        self._node_types_combination = node_types_combination
        self._use_node_types = use_node_types
        self._use_edge_types = use_edge_types
        self._node_type_embedding_size = node_type_embedding_size
        self._edge_type_embedding_size = edge_type_embedding_size
        self._graph = graph
        self._distance_metric = distance_metric
        self._relu_bias = relu_bias
        self._support_mirrored_strategy = support_mirrored_strategy

        super().__init__(
            vocabulary_size=graph.get_number_of_nodes(),
            embedding_size=node_embedding_size,
            embedding=embedding,
            extra_features=extra_features,
            optimizer=optimizer,
            use_gradient_centralization=use_gradient_centralization
        )

    def _build_model(self):
        """Return Node2Vec model."""
        # Creating the inputs layers
        input_layers = []
        source_nodes_input = Input((1,), name="source_nodes")
        input_layers.append(source_nodes_input)
        if self._use_node_types:
            source_node_types_input = Input(
                (self._max_node_types,), name="source_node_types")
            input_layers.append(source_node_types_input)

        destination_nodes_input = Input((1,), name="destination_nodes")
        input_layers.append(destination_nodes_input)
        if self._use_node_types:
            destination_node_types_input = Input(
                (self._max_node_types,), name="destination_node_types")
            input_layers.append(destination_node_types_input)

        if self._use_edge_types:
            edge_types_input = Input((1,), name="destination_edge_types")
            input_layers.append(edge_types_input)
        else:
            edge_types_input = None

        # Creating the embedding layer for the contexts
        node_embedding_layer = Embedding(
            input_dim=self._vocabulary_size,
            output_dim=self._embedding_size,
            input_length=1,
            name=Embedder.TERMS_EMBEDDING_LAYER_NAME
        )

        # Get the node embedding
        source_node_embedding = node_embedding_layer(source_nodes_input)
        destination_node_embedding = node_embedding_layer(
            destination_nodes_input
        )

        # Applying UnitNorm to them
        source_node_embedding = UnitNorm(axis=-1)(source_node_embedding)
        destination_node_embedding = UnitNorm(axis=-1)(destination_node_embedding)

        if self._use_node_types:
            node_type_embedding_layer = Embedding(
                input_dim=self._node_types_number +
                int(self._multilabel_node_types),
                output_dim=self._node_type_embedding_size,
                input_length=self._max_node_types,
                name=Siamese.NODE_TYPE_EMBEDDING_LAYER_NAME,
                mask_zero=self._multilabel_node_types
            )
            source_node_types_embedding = node_type_embedding_layer(
                source_node_types_input
            )
            destination_node_types_embedding = node_type_embedding_layer(
                destination_node_types_input
            )
            
            if self._multilabel_node_types:
                global_average_layer = GlobalAveragePooling1D()
                source_node_types_embedding = global_average_layer(
                    source_node_types_embedding
                )
                destination_node_types_embedding = global_average_layer(
                    destination_node_types_embedding
                )

            source_node_types_embedding = UnitNorm(axis=-1)(
                source_node_types_embedding
            )
            destination_node_types_embedding = UnitNorm(axis=-1)(
                destination_node_types_embedding
            )

            if self._node_types_combination == "Add":
                node_types_concatenation = Add()
            elif self._node_types_combination == "Concatenate":
                node_types_concatenation = Concatenate()
            else:
                raise ValueError(
                    "Supported node types concatenations are Dot, Add and Concatenate."
                )
            
            source_node_embedding = node_types_concatenation([
                source_node_embedding,
                source_node_types_embedding
            ])
            destination_node_embedding = node_types_concatenation([
                destination_node_embedding,
                destination_node_types_embedding
            ])

        if self._use_edge_types:
            edge_type_embedding = Embedding(
                input_dim=self._edge_types_number,
                output_dim=self._edge_type_embedding_size,
                input_length=1,
                name=Siamese.EDGE_TYPE_EMBEDDING_LAYER_NAME,
            )(edge_types_input)
            edge_type_embedding = UnitNorm(axis=-1)(edge_type_embedding)
        else:
            edge_type_embedding = None

        # Creating the actual model
        model = Model(
            inputs=input_layers,
            outputs=self._build_output(
                source_node_embedding,
                destination_node_embedding,
                edge_type_embedding,
                edge_types_input
            ),
            name=self._model_name
        )
        return model

    def _build_output(
        self,
        source_node_embedding: tf.Tensor,
        destination_node_embedding: tf.Tensor,
        *args: List[tf.Tensor],
        **kwargs: Dict[str, tf.Tensor],
    ):
        """Return output of the model.

        Parameters
        ----------------------
        source_node_embedding: tf.Tensor,
            Embedding of the source node.
        destination_node_embedding: tf.Tensor,
            Embedding of the destination node.
        args: List[tf.Tensor],
            Additional tensors that may be used
            in subclasses of this model.
        kwargs: Dict[str, tf.Tensor],
            Additional tensors that may be used
            in subclasses of this model.

        Returns
        ----------------------
        The distance for the Siamese network.
        """
        if self._distance_metric == "L1":
            return K.sum(
                source_node_embedding - destination_node_embedding,
                axis=-1
            )
        if self._distance_metric == "L2":
            return K.sum(K.square(source_node_embedding - destination_node_embedding), axis=-1)
        if self._distance_metric == "COSINE":
            return 1.0 - tf.losses.cosine_similarity(source_node_embedding, destination_node_embedding)
        raise ValueError(
            "Given distance metric {} is not supported.".format(self._distance_metric))

    def _siamese_loss(self, y_true: tf.Tensor, y_pred: tf.Tensor) -> float:
        """Compute the siamese loss function.

        Parameters
        ---------------------------
        y_true: tf.Tensor,
            The true values Tensor for this batch.
        y_pred: tf.Tensor,
            The predicted values Tensor for this batch.

        Returns
        ---------------------------
        Loss function score related to this batch.
        """
        # TODO: check what happens with and without relu cutoff
        y_true = tf.cast(y_true, "float32")
        return self._relu_bias + K.mean(
            (1 - 2 * y_true) * y_pred,
            axis=-1
        )

    def get_embedding_dataframe(self) -> List[pd.DataFrame]:
        """Return terms embedding using given index names."""
        values = [
            pd.DataFrame(
                self.get_layer_weights(Embedder.TERMS_EMBEDDING_LAYER_NAME),
                index=self._graph.get_node_names(),
            ),
        ]
        if self._use_node_types:
            node_types_embedding = self.get_layer_weights(
                Siamese.NODE_TYPE_EMBEDDING_LAYER_NAME
            )
            if self._multilabel_node_types:
                node_types_embedding = node_types_embedding[1:]
            values.append(
                pd.DataFrame(
                    node_types_embedding,
                    index=self._graph.get_unique_node_type_names(),
                ),
            )
        if self._use_edge_types:
            try:
                values.append(
                    pd.DataFrame(
                        self.get_layer_weights(
                            Siamese.EDGE_TYPE_EMBEDDING_LAYER_NAME
                        ),
                        index=self._graph.get_unique_edge_type_names(),
                    ),
                )
            except NotImplementedError:
                pass

        return values

    def _compile_model(self) -> Model:
        """Compile model."""
        self._model.compile(
            loss=self._siamese_loss,
            optimizer=self._optimizer
        )

    @property
    def embedding(self) -> np.ndarray:
        """Return model embeddings.

        Raises
        -------------------
        NotImplementedError,
            If the current embedding model does not have an embedding layer.
        """
        # TODO create multiple getters for the various embedding layers.
        return Embedder.embedding.fget(self)  # pylint: disable=no-member

    def fit(
        self,
        batch_size: int = 2**20,
        negative_samples_rate: float = 0.5,
        avoid_false_negatives: bool = False,
        graph_to_avoid: Graph = None,
        batches_per_epoch: Union[int, str] = "auto",
        elapsed_epochs: int = 0,
        epochs: int = 1000,
        early_stopping_monitor: str = "loss",
        early_stopping_min_delta: float = 0.01,
        early_stopping_patience: int = 5,
        early_stopping_mode: str = "min",
        reduce_lr_monitor: str = "loss",
        reduce_lr_min_delta: float = 0.01,
        reduce_lr_patience: int = 2,
        reduce_lr_mode: str = "min",
        reduce_lr_factor: float = 0.9,
        verbose: int = 2,
        **kwargs: Dict
    ) -> pd.DataFrame:
        """Return pandas dataframe with training history.

        Parameters
        -----------------------
        batch_size: int = 2**20,
            Number of edge samples per training batch.
        negative_samples_rate: float = 0.5,
            Rate of negative (non-existent) edges sampled in each batch.
        avoid_false_negatives: bool = False,
            Whether to filter out false negatives, that is sampled negative
            edges that actually exist in the graph.
            This check slows down the batch generation.
        graph_to_avoid: Graph = None,
            Optional graph whose edges must never be sampled as negatives,
            for instance a validation graph.
        batches_per_epoch: Union[int, str] = "auto",
            Number of batches per epoch.
            With "auto", it is derived from the size of the graph.
        elapsed_epochs: int = 0,
            Number of epochs already elapsed, used when resuming training.
        epochs: int = 1000,
            Epochs to train the model for.
        early_stopping_monitor: str = "loss",
            Metric to monitor for early stopping.
        early_stopping_min_delta: float = 0.01,
            Minimum delta of metric to stop the training.
        early_stopping_patience: int = 5,
            Number of epochs to wait, once the given minimum delta is no
            longer achieved, before triggering early stopping.
        early_stopping_mode: str = "min",
            Direction of the variation of the monitored metric for early stopping.
        reduce_lr_monitor: str = "loss",
            Metric to monitor for reducing learning rate.
        reduce_lr_min_delta: float = 0.01,
            Minimum delta of metric to reduce learning rate.
        reduce_lr_patience: int = 2,
            Number of epochs to wait, once the given minimum delta is no
            longer achieved, before reducing the learning rate.
        reduce_lr_mode: str = "min",
            Direction of the variation of the monitored metric for learning rate.
        reduce_lr_factor: float = 0.9,
            Factor for reduction of learning rate.
        verbose: int = 2,
            Whether to show the loading bar.
            Specifically, the options are:
            * 0 or False: No loading bar.
            * 1 or True: Show only the loading bar for the epochs.
            * 2: Show loading bars for both epochs and batches.
        **kwargs: Dict,
            Additional kwargs to pass to the Keras fit call.

        Returns
        -----------------------
        Dataframe with training history.
        """
        sequence = EdgePredictionSequence(
            graph=self._graph,
            batch_size=batch_size,
            avoid_false_negatives=avoid_false_negatives,
            support_mirrored_strategy=self._support_mirrored_strategy,
            graph_to_avoid=graph_to_avoid,
            use_node_types=self._use_node_types,
            use_edge_types=self._use_edge_types,
            negative_samples_rate=negative_samples_rate,
            elapsed_epochs=elapsed_epochs,
            batches_per_epoch=batches_per_epoch
        )
        return super().fit(
            sequence,
            epochs=epochs,
            early_stopping_monitor=early_stopping_monitor,
            early_stopping_min_delta=early_stopping_min_delta,
            early_stopping_patience=early_stopping_patience,
            early_stopping_mode=early_stopping_mode,
            reduce_lr_monitor=reduce_lr_monitor,
            reduce_lr_min_delta=reduce_lr_min_delta,
            reduce_lr_patience=reduce_lr_patience,
            reduce_lr_mode=reduce_lr_mode,
            reduce_lr_factor=reduce_lr_factor,
            verbose=verbose,
            **kwargs
        )

Siamese network for node-embedding including optionally node types and edge types.

#   Siamese( graph: Graph, use_node_types: Union[bool, str] = 'auto', node_types_combination: str = 'Add', use_edge_types: Union[bool, str] = 'auto', node_embedding_size: int = 100, node_type_embedding_size: int = 100, edge_type_embedding_size: int = 100, distance_metric: str = 'COSINE', relu_bias: float = 1.0, embedding: Union[numpy.ndarray, pandas.core.frame.DataFrame, NoneType] = None, extra_features: Union[numpy.ndarray, pandas.core.frame.DataFrame, NoneType] = None, model_name: str = 'Siamese', optimizer: Union[str, keras.optimizer_v2.optimizer_v2.OptimizerV2] = None, support_mirrored_strategy: bool = False, use_gradient_centralization: str = 'auto' )
View Source
    def __init__(
        self,
        graph: Graph,
        use_node_types: Union[bool, str] = "auto",
        node_types_combination: str = "Add",
        use_edge_types: Union[bool, str] = "auto",
        node_embedding_size: int = 100,
        node_type_embedding_size: int = 100,
        edge_type_embedding_size: int = 100,
        distance_metric: str = "COSINE",
        relu_bias: float = 1.0,
        embedding: Optional[Union[np.ndarray, pd.DataFrame]] = None,
        extra_features: Optional[Union[np.ndarray, pd.DataFrame]] = None,
        model_name: str = "Siamese",
        optimizer: Union[str, Optimizer] = None,
        support_mirrored_strategy: bool = False,
        use_gradient_centralization: str = "auto"
    ):
        """Create new sequence Embedder model.

        Parameters
        -------------------------------------------
        graph: Graph,
            The graph whose nodes are to be embedded.
        use_node_types: Union[bool, str] = "auto",
            Whether to use node types.
            By default, it will automatically use node types if the graph
            contains node types and does not contain any unknown node type.
        node_types_combination: str = "Add",
            Method to combine the node embedding with the node type embedding.
            The supported methods are "Add" and "Concatenate".
        use_edge_types: Union[bool, str] = "auto",
            Whether to use edge types.
            By default, it will automatically use edge types if the graph
            contains edge types and does not contain any unknown edge type.
        node_embedding_size: int = 100,
            Dimension of the embedding.
            If None, the seed embedding must be provided.
            It is not possible to provide both at once.
        node_type_embedding_size: int = 100,
            Dimension of the embedding for the node types.
        edge_type_embedding_size: int = 100,
            Dimension of the embedding for the edge types.
        distance_metric: str = "COSINE",
            The distance to use for the loss function.
            Supported methods are L1, L2 and COSINE.
        relu_bias: float = 1.0,
            The bias to use for the ReLU.
        embedding: Union[np.ndarray, pd.DataFrame] = None,
            The seed embedding to be used.
            Note that it is not possible to provide at once both
            the embedding and either the vocabulary size or the embedding size.
        extra_features: Union[np.ndarray, pd.DataFrame] = None,
            Optional extra features to be used during the computation
            of the embedding. The features must be available for all the
            elements considered for the embedding.
        model_name: str = "Siamese",
            Name of the model.
        optimizer: Union[str, Optimizer] = None,
            The optimizer to be used during the training of the model.
        support_mirrored_strategy: bool = False,
            Whether to patch support for MirroredStrategy.
            At the time of writing, TensorFlow's MirroredStrategy does not support
            input values different from floats, therefore to support it we need
            to convert the unsigned int 32 values that represent the indices of
            the embedding layers we receive from Ensmallen to floats.
            This will generally slow down performance, but in the context of
            exploiting multiple GPUs it may be unnoticeable.
        use_gradient_centralization: Union[bool, str] = "auto",
            Whether to wrap the provided optimizer into a normalized
            one that centralizes the gradient.
            It is automatically enabled if the current version of
            TensorFlow supports gradient transformers.
            More detail here: https://arxiv.org/pdf/2004.01461.pdf
        """
        self._model_name = model_name
        if graph.has_disconnected_nodes():
            warnings.warn(
                "The graph contains disconnected nodes: these nodes will "
                "not be embedded in a semantically sensible way, but "
                "will only obtain a random node embedding vector which is "
                "far from all other nodes."
            )
        if use_node_types == "auto":
            use_node_types = graph.has_node_types() and not graph.has_unknown_node_types()
        if use_node_types:
            if not graph.has_node_types():
                raise ValueError(
                    "Node types are to be used to embed the given graph, "
                    "but the graph does not have node types"
                )
            if graph.has_unknown_node_types():
                raise ValueError(
                    "Node types are to be used to embed the given graph, "
                    "but the graph contains unknown node types and this "
                    "type of model is not designed in order to handle "
                    "unknown node types."
                )
            if graph.has_singleton_node_types():
                warnings.warn(
                    "The node types will be used in order to compute the node "
                    "embedding, but there are some singleton node types: these "
                    "node types will not capture any characteristic that is not "
                    "already captured by the node embedding, and may be an error "
                    "in the pipeline you have used to create this graph."
                )

            if graph.has_homogeneous_node_types():
                warnings.warn(
                    "The graph contains exclusively nodes with a homogenous "
                    "node type!"
                )

            self._multilabel_node_types = graph.has_multilabel_node_types()
            self._max_node_types = graph.get_maximum_multilabel_count()
            self._node_types_number = graph.get_node_types_number()
        if use_edge_types == "auto":
            use_edge_types = graph.has_edge_types() and not graph.has_unknown_edge_types()
        if use_edge_types:
            if not graph.has_edge_types():
                raise ValueError(
                    "Edge types are to be used to embed the given graph, "
                    "but the graph does not have edge types"
                )
            if graph.has_unknown_edge_types():
                raise ValueError(
                    "Edge types are to be used to embed the given graph, "
                    "but the graph contains unknown edge types and this "
                    "type of model is not designed in order to handle "
                    "unknown edge types."
                )
            if graph.has_singleton_edge_types():
                warnings.warn(
                    "The edge types will be used in order to compute the edge "
                    "embedding, but there are some singleton edge types: these "
                    "edge types will not capture any characteristic that is not "
                    "already captured by the edge embedding, and may be an error "
                    "in the pipeline you have used to create this graph."
                )

            if graph.has_homogeneous_edge_types():
                warnings.warn(
                    "The graph contains exclusively edges with a homogenous "
                    "edge type!"
                )

            self._edge_types_number = graph.get_edge_types_number()

        self._node_types_combination = node_types_combination
        self._use_node_types = use_node_types
        self._use_edge_types = use_edge_types
        self._node_type_embedding_size = node_type_embedding_size
        self._edge_type_embedding_size = edge_type_embedding_size
        self._graph = graph
        self._distance_metric = distance_metric
        self._relu_bias = relu_bias
        self._support_mirrored_strategy = support_mirrored_strategy

        super().__init__(
            vocabulary_size=graph.get_number_of_nodes(),
            embedding_size=node_embedding_size,
            embedding=embedding,
            extra_features=extra_features,
            optimizer=optimizer,
            use_gradient_centralization=use_gradient_centralization
        )

Create new Siamese model.

Parameters
  • graph (Graph,): The graph whose nodes are to be embedded.
  • use_node_types (Union[bool, str] = "auto",): Whether to use node types. By default, it will automatically use node types if the graph contains node types and does not contain any unknown node type.
  • node_types_combination (str = "Add",): Method to combine the node embedding with the node type embedding. The supported methods are "Add" and "Concatenate".
  • use_edge_types (Union[bool, str] = "auto",): Whether to use edge types. By default, it will automatically use edge types if the graph contains edge types and does not contain any unknown edge type.
  • node_embedding_size (int = 100,): Dimension of the embedding. If None, the seed embedding must be provided. It is not possible to provide both at once.
  • node_type_embedding_size (int = 100,): Dimension of the embedding for the node types.
  • edge_type_embedding_size (int = 100,): Dimension of the embedding for the edge types.
  • distance_metric (str = "COSINE",): The distance to use for the loss function. Supported methods are L1, L2 and COSINE.
  • relu_bias (float = 1.0,): The bias to use for the ReLU.
  • embedding (Union[np.ndarray, pd.DataFrame] = None,): The seed embedding to be used. Note that it is not possible to provide at once both the embedding and either the vocabulary size or the embedding size.
  • extra_features (Union[np.ndarray, pd.DataFrame] = None,): Optional extra features to be used during the computation of the embedding. The features must be available for all the elements considered for the embedding.
  • model_name (str = "Siamese",): Name of the model.
  • optimizer (Union[str, Optimizer] = None,): The optimizer to be used during the training of the model.
  • support_mirrored_strategy (bool = False,): Whether to patch support for MirroredStrategy. At the time of writing, TensorFlow's MirroredStrategy does not support input values different from floats, therefore to support it we need to convert the unsigned int 32 values that represent the indices of the embedding layers we receive from Ensmallen to floats. This will generally slow down performance, but in the context of exploiting multiple GPUs it may be unnoticeable.
  • use_gradient_centralization (Union[bool, str] = "auto",): Whether to wrap the provided optimizer into a normalized one that centralizes the gradient. It is automatically enabled if the current version of TensorFlow supports gradient transformers. More detail here: https://arxiv.org/pdf/2004.01461.pdf
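A hedged construction sketch (hypothetical setup: `graph` is assumed to be an Ensmallen Graph already available in scope):

    from grape.embiggen import Siamese

    # Build a Siamese embedder that decides automatically whether to use
    # node and edge types, and uses the squared L2 distance in the loss.
    model = Siamese(
        graph=graph,
        node_embedding_size=128,
        distance_metric="L2",
    )
    model.summary()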
#   NODE_TYPE_EMBEDDING_LAYER_NAME = 'node_type_embedding_layer'
#   EDGE_TYPE_EMBEDDING_LAYER_NAME = 'edge_type_embedding_layer'
#   def get_embedding_dataframe(self) -> List[pandas.core.frame.DataFrame]:
View Source
    def get_embedding_dataframe(self) -> List[pd.DataFrame]:
        """Return terms embedding using given index names."""
        values = [
            pd.DataFrame(
                self.get_layer_weights(Embedder.TERMS_EMBEDDING_LAYER_NAME),
                index=self._graph.get_node_names(),
            ),
        ]
        if self._use_node_types:
            node_types_embedding = self.get_layer_weights(
                Siamese.NODE_TYPE_EMBEDDING_LAYER_NAME
            )
            if self._multilabel_node_types:
                node_types_embedding = node_types_embedding[1:]
            values.append(
                pd.DataFrame(
                    node_types_embedding,
                    index=self._graph.get_unique_node_type_names(),
                ),
            )
        if self._use_edge_types:
            try:
                values.append(
                    pd.DataFrame(
                        self.get_layer_weights(
                            Siamese.EDGE_TYPE_EMBEDDING_LAYER_NAME
                        ),
                        index=self._graph.get_unique_edge_type_names(),
                    ),
                )
            except NotImplementedError:
                pass

        return values

Return terms embedding using given index names.
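A hedged usage sketch for get_embedding_dataframe (assumes `model` is a fitted Siamese instance):

    # The list contains the node embedding first, followed by the node type
    # and/or edge type embeddings when the model was built to use them.
    frames = model.get_embedding_dataframe()
    node_embedding = frames[0]
    node_embedding.to_csv("node_embedding.csv")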

#   embedding: numpy.ndarray

Return model embeddings.

Raises
  • NotImplementedError,: If the current embedding model does not have an embedding layer.
#   def fit( self, batch_size: int = 1048576, negative_samples_rate: float = 0.5, avoid_false_negatives: bool = False, graph_to_avoid: Graph = None, batches_per_epoch: Union[int, str] = 'auto', elapsed_epochs: int = 0, epochs: int = 1000, early_stopping_monitor: str = 'loss', early_stopping_min_delta: float = 0.01, early_stopping_patience: int = 5, early_stopping_mode: str = 'min', reduce_lr_monitor: str = 'loss', reduce_lr_min_delta: float = 0.01, reduce_lr_patience: int = 2, reduce_lr_mode: str = 'min', reduce_lr_factor: float = 0.9, verbose: int = 2, **kwargs: Dict ) -> pandas.core.frame.DataFrame:
View Source
    def fit(
        self,
        batch_size: int = 2**20,
        negative_samples_rate: float = 0.5,
        avoid_false_negatives: bool = False,
        graph_to_avoid: Graph = None,
        batches_per_epoch: Union[int, str] = "auto",
        elapsed_epochs: int = 0,
        epochs: int = 1000,
        early_stopping_monitor: str = "loss",
        early_stopping_min_delta: float = 0.01,
        early_stopping_patience: int = 5,
        early_stopping_mode: str = "min",
        reduce_lr_monitor: str = "loss",
        reduce_lr_min_delta: float = 0.01,
        reduce_lr_patience: int = 2,
        reduce_lr_mode: str = "min",
        reduce_lr_factor: float = 0.9,
        verbose: int = 2,
        **kwargs: Dict
    ) -> pd.DataFrame:
        """Return pandas dataframe with training history.

        Parameters
        -----------------------
        batch_size: int = 2**20,
            Number of edge samples per training batch.
        negative_samples_rate: float = 0.5,
            Rate of negative (non-existent) edges sampled in each batch.
        avoid_false_negatives: bool = False,
            Whether to filter out false negatives, that is sampled negative
            edges that actually exist in the graph.
            This check slows down the batch generation.
        graph_to_avoid: Graph = None,
            Optional graph whose edges must never be sampled as negatives,
            for instance a validation graph.
        batches_per_epoch: Union[int, str] = "auto",
            Number of batches per epoch.
            With "auto", it is derived from the size of the graph.
        elapsed_epochs: int = 0,
            Number of epochs already elapsed, used when resuming training.
        epochs: int = 1000,
            Epochs to train the model for.
        early_stopping_monitor: str = "loss",
            Metric to monitor for early stopping.
        early_stopping_min_delta: float = 0.01,
            Minimum delta of metric to stop the training.
        early_stopping_patience: int = 5,
            Number of epochs to wait, once the given minimum delta is no
            longer achieved, before triggering early stopping.
        early_stopping_mode: str = "min",
            Direction of the variation of the monitored metric for early stopping.
        reduce_lr_monitor: str = "loss",
            Metric to monitor for reducing learning rate.
        reduce_lr_min_delta: float = 0.01,
            Minimum delta of metric to reduce learning rate.
        reduce_lr_patience: int = 2,
            Number of epochs to wait, once the given minimum delta is no
            longer achieved, before reducing the learning rate.
        reduce_lr_mode: str = "min",
            Direction of the variation of the monitored metric for learning rate.
        reduce_lr_factor: float = 0.9,
            Factor for reduction of learning rate.
        verbose: int = 2,
            Whether to show the loading bar.
            Specifically, the options are:
            * 0 or False: No loading bar.
            * 1 or True: Show only the loading bar for the epochs.
            * 2: Show loading bars for both epochs and batches.
        **kwargs: Dict,
            Additional kwargs to pass to the Keras fit call.

        Returns
        -----------------------
        Dataframe with training history.
        """
        sequence = EdgePredictionSequence(
            graph=self._graph,
            batch_size=batch_size,
            avoid_false_negatives=avoid_false_negatives,
            support_mirrored_strategy=self._support_mirrored_strategy,
            graph_to_avoid=graph_to_avoid,
            use_node_types=self._use_node_types,
            use_edge_types=self._use_edge_types,
            negative_samples_rate=negative_samples_rate,
            elapsed_epochs=elapsed_epochs,
            batches_per_epoch=batches_per_epoch
        )
        return super().fit(
            sequence,
            epochs=epochs,
            early_stopping_monitor=early_stopping_monitor,
            early_stopping_min_delta=early_stopping_min_delta,
            early_stopping_patience=early_stopping_patience,
            early_stopping_mode=early_stopping_mode,
            reduce_lr_monitor=reduce_lr_monitor,
            reduce_lr_min_delta=reduce_lr_min_delta,
            reduce_lr_patience=reduce_lr_patience,
            reduce_lr_mode=reduce_lr_mode,
            reduce_lr_factor=reduce_lr_factor,
            verbose=verbose,
            **kwargs
        )

Return pandas dataframe with training history.

Parameters
  • batch_size (int = 2**20,): Number of edge samples per training batch.
  • negative_samples_rate (float = 0.5,): Rate of negative (non-existent) edges sampled in each batch.
  • avoid_false_negatives (bool = False,): Whether to filter out false negatives, that is sampled negative edges that actually exist in the graph. This check slows down the batch generation.
  • graph_to_avoid (Graph = None,): Optional graph whose edges must never be sampled as negatives, for instance a validation graph.
  • batches_per_epoch (Union[int, str] = "auto",): Number of batches per epoch. With "auto", it is derived from the size of the graph.
  • elapsed_epochs (int = 0,): Number of epochs already elapsed, used when resuming training.
  • epochs (int = 1000,): Epochs to train the model for.
  • early_stopping_monitor (str = "loss",): Metric to monitor for early stopping.
  • early_stopping_min_delta (float = 0.01,): Minimum delta of metric to stop the training.
  • early_stopping_patience (int = 5,): Number of epochs to wait, once the given minimum delta is no longer achieved, before triggering early stopping.
  • early_stopping_mode (str = "min",): Direction of the variation of the monitored metric for early stopping.
  • reduce_lr_monitor (str = "loss",): Metric to monitor for reducing learning rate.
  • reduce_lr_min_delta (float = 0.01,): Minimum delta of metric to reduce learning rate.
  • reduce_lr_patience (int = 2,): Number of epochs to wait, once the given minimum delta is no longer achieved, before reducing the learning rate.
  • reduce_lr_mode (str = "min",): Direction of the variation of the monitored metric for learning rate.
  • reduce_lr_factor (float = 0.9,): Factor for reduction of learning rate.
  • verbose (int = 2,): Whether to show the loading bar. Specifically, the options are:
    • 0 or False: No loading bar.
    • 1 or True: Show only the loading bar for the epochs.
    • 2: Show loading bars for both epochs and batches.
  • **kwargs (Dict,): Additional kwargs to pass to the Keras fit call.
Returns
  • Dataframe with training history.
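To make the training objective above concrete, here is a small NumPy sketch of how the _siamese_loss defined in the class body behaves (illustrative values only):

    import numpy as np

    relu_bias = 1.0
    y_true = np.array([1.0, 1.0, 0.0, 0.0])  # 1 = existing edge, 0 = sampled negative
    y_pred = np.array([0.1, 0.3, 0.9, 1.2])  # predicted distances

    # (1 - 2 * y_true) is -1 for positives and +1 for negatives, so the loss
    # rewards small distances on existing edges and large distances on negatives.
    loss = relu_bias + np.mean((1 - 2 * y_true) * y_pred)
    print(loss)  # 1.0 + (-0.1 - 0.3 + 0.9 + 1.2) / 4 = 1.425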
Inherited Members
grape.embiggen.embedders.embedder.Embedder
TERMS_EMBEDDING_LAYER_NAME
trainable
summary
get_layer_weights
save_embedding
name
save_weights
load_weights
#   class GraphConvolutionalNeuralNetwork:
View Source
class GraphConvolutionalNeuralNetwork:
    """Graph Convolutional Neural Network (GCNN) model for graph embedding."""

    def __init__(
        self,
        graph: Graph,
        use_weights: Union[str, bool] = "auto",
        use_class_weights: bool = True,
        node_features_number: Optional[int] = None,
        node_features: Optional[pd.DataFrame] = None,
        number_of_hidden_layers: int = 1,
        number_of_units_per_hidden_layer: Union[int, List[int]] = 16,
        use_dense_hidden_layers: bool = False,
        activations_per_hidden_layer: Union[str, List[str]] = "relu",
        kernel_initializer: Union[str, Initializer] = 'glorot_uniform',
        bias_initializer: Union[str, Initializer] = 'zeros',
        kernel_regularizer: Union[str, Regularizer] = None,
        bias_regularizer: Union[str, Regularizer] = None,
        activity_regularizer: Union[str, Regularizer] = None,
        kernel_constraint: Union[str, Constraint] = None,
        bias_constraint: Union[str, Constraint] = None,
        features_dropout_rate: float = 0.5,
        optimizer: Union[str, Optimizer] = "nadam",
    ):
        """Create new GloVe-based Embedder object.

        Parameters
        -------------------------------
        graph: Graph,
            The graph for which to build the model.
        use_weights: Union[str, bool] = "auto",
            Whether to expect weights in input to execute the graph convolution.
            The weights may be used in order to compute for instance a weighting
            using the symmetric normalized Laplacian method.
        use_class_weights: bool = True,
            Whether to use class weights to rebalance the loss relative to unbalanced classes.
            Learn more about class weights here: https://www.tensorflow.org/tutorials/structured_data/imbalanced_data
        node_features_number: Optional[int] = None,
            Number of node features.
            If the node features are provided, the number of features is extracted from them.
        node_features: Optional[pd.DataFrame] = None,
            DataFrame with the provided node features, indexed by node name.
        number_of_hidden_layers: int = 1,
            Number of graph convolution layers.
        number_of_units_per_hidden_layer: Union[int, List[int]] = 16,
            Number of units per hidden layer.
        use_dense_hidden_layers: bool = False,
            Whether to use dense layers for the hidden layers.
            This is useful in the context of a shallow GCN, when it is not
            feasible to use a batch size equal to the number of nodes in the graph.
        activations_per_hidden_layer: Union[str, List[str]] = "relu",
            Activation function to use for each hidden layer.
        kernel_initializer: Union[str, Initializer] = 'glorot_uniform',
            Initializer for the kernel weights matrix.
        bias_initializer: Union[str, Initializer] = 'zeros',
            Initializer for the bias vector.
        kernel_regularizer: Union[str, Regularizer] = None,
            Regularizer function applied to the kernel weights matrix.
        bias_regularizer: Union[str, Regularizer] = None,
            Regularizer function applied to the bias vector.
        activity_regularizer: Union[str, Regularizer] = None,
            Regularizer function applied to the output of the activation function.
        kernel_constraint: Union[str, Constraint] = None,
            Constraint function applied to the kernel matrix.
        bias_constraint: Union[str, Constraint] = None,
            Constraint function applied to the bias vector.
        features_dropout_rate: float = 0.5,
            Float between 0 and 1. Fraction of the input units to drop.
        optimizer: Union[str, Optimizer] = "nadam",
            The optimizer to be used during the training of the model.

        """
        if isinstance(number_of_units_per_hidden_layer, int):
            number_of_units_per_hidden_layer = [
                number_of_units_per_hidden_layer
            ] * number_of_hidden_layers

        if isinstance(activations_per_hidden_layer, str):
            activations_per_hidden_layer = [
                activations_per_hidden_layer
            ] * number_of_hidden_layers

        if len(number_of_units_per_hidden_layer) != number_of_hidden_layers:
            raise ValueError(
                "The number of hidden layers must match"
                "the number of the hidden units per layer provided"
            )

        if len(activations_per_hidden_layer) != number_of_hidden_layers:
            raise ValueError(
                "The number of hidden layers must match"
                "the number of the activations per layer provided"
            )

        use_weights_supported_values = ("auto", True, False)
        if use_weights not in use_weights_supported_values:
            raise ValueError(
                (
                    "The provided value for `use_weights`, '{}', is not among the supported values '{}'."
                ).format(use_weights, use_weights_supported_values)
            )
        if node_features is not None and any(node_features.index != graph.get_node_names()):
            raise ValueError(
                "The provided node features DataFrame is not aligned with the "
                "provided graph nodes."
            )
        if node_features_number is None and node_features is None:
            raise ValueError(
                "Eiter the number of node features or the node features "
                "themselves must be provided."
            )
        if use_weights == "auto":
            use_weights = graph.has_edge_weights()
        if node_features is not None:
            node_features_number = node_features.shape[-1]
        self._use_weights = use_weights
        self._use_class_weights = use_class_weights
        self._node_features_number = node_features_number
        self._node_features = node_features
        self._nodes_number = graph.get_number_of_nodes()
        self._node_types_number = graph.get_node_types_number()
        number_of_units_per_hidden_layer[-1] = self._node_types_number
        self._number_of_hidden_layers = number_of_hidden_layers
        self._use_dense_hidden_layers = use_dense_hidden_layers
        self._kernel_initializer = kernel_initializer
        self._bias_initializer = bias_initializer
        self._kernel_regularizer = kernel_regularizer
        self._bias_regularizer = bias_regularizer
        self._activity_regularizer = activity_regularizer
        self._kernel_constraint = kernel_constraint
        self._bias_constraint = bias_constraint
        self._features_dropout_rate = features_dropout_rate
        self._number_of_units_per_hidden_layer = number_of_units_per_hidden_layer
        self._multi_label = graph.has_multilabel_node_types()
        activations_per_hidden_layer[-1] = "sigmoid" if self._multi_label or self._node_types_number == 1 else "softmax"
        self._activations_per_hidden_layer = activations_per_hidden_layer
        self._optimizer = optimizer
        self._adjacency_matrix = graph_to_sparse_tensor(
            graph,
            use_weights=self._use_weights
        )
        self._model = self._build_model()
        self._compile_model()

    def _build_model(self):
        """Create new GCN model."""
        adjacency_matrix = Input(
            shape=(self._nodes_number,),
            sparse=True
        )

        input_graph_convolution = GraphConvolution(
            self._number_of_units_per_hidden_layer[0],
            activation=self._activations_per_hidden_layer[0],
            features_dropout_rate=self._features_dropout_rate,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            kernel_regularizer=self._kernel_regularizer,
            bias_regularizer=self._bias_regularizer,
            activity_regularizer=self._activity_regularizer,
            kernel_constraint=self._kernel_constraint,
            bias_constraint=self._bias_constraint,
        )

        input_graph_convolution.build(self._node_features_number)

        if self._node_features is None:
            node_features = input_graph_convolution.add_weight(
                name="node_features",
                shape=(self._nodes_number, self._node_features_number),
                trainable=True,
                initializer="glorot_normal",
                dtype=tf.float32
            )
        else:
            node_features = tf.Variable(
                initial_value=self._node_features.values,
                trainable=False,
                validate_shape=True,
                name="node_features",
                shape=(self._nodes_number, self._node_features_number),
                dtype=tf.float32
            )

        hidden = input_graph_convolution(adjacency_matrix, node_features)
        for i in range(1, self._number_of_hidden_layers):
            kwargs = dict(
                units=self._number_of_units_per_hidden_layer[i],
                activation=self._activations_per_hidden_layer[i],
                kernel_initializer=self._kernel_initializer,
                bias_initializer=self._bias_initializer,
                kernel_regularizer=self._kernel_regularizer,
                bias_regularizer=self._bias_regularizer,
                activity_regularizer=self._activity_regularizer,
                kernel_constraint=self._kernel_constraint,
                bias_constraint=self._bias_constraint
            )
            if self._use_dense_hidden_layers:
                hidden = Dense(**kwargs)(hidden)
            else:
                gcn_hidden = GraphConvolution(
                    features_dropout_rate=self._features_dropout_rate,
                    **kwargs,
                )

                gcn_hidden.build(self._number_of_units_per_hidden_layer[i-1])

                hidden = gcn_hidden(adjacency_matrix, hidden)

        return Model(
            inputs=adjacency_matrix,
            outputs=hidden,
            name="GCN"
        )

    def _compile_model(self) -> Model:
        """Compile model."""
        self._model.compile(
            loss='binary_crossentropy' if self._node_types_number == 1 or self._multi_label else "categorical_crossentropy",
            optimizer=self._optimizer,
            weighted_metrics=get_minimal_multiclass_metrics()
        )

    @property
    def name(self) -> str:
        return self._model.name

    def summary(self):
        """Print model summary."""
        self._model.summary()

    def run_batch_size_check(self, batch_size: Union[int, str]) -> int:
        """Runs check for valid batch size given the model.

        Parameters
        -----------------
        batch_size: Union[int, str],
            The batch size to check.

        Raises
        -----------------
        ValueError,
            If the given batch size is not compatible with the
            current model.

        Returns
        -----------------
        Validated batch size.
        """
        if batch_size == "auto":
            return self._nodes_number

        if self._number_of_hidden_layers != 1 and not self._use_dense_hidden_layers and batch_size != self._nodes_number:
            raise ValueError(
                "If the number of GCN layers is greater than 1, "
                "the batch size must be equal to the number of "
                "nodes in the graph.\n"
                "Alternatively you can use Dense layers for the "
                "hidden layers."
            )

        return batch_size

    def fit(
        self,
        train_graph: Graph,
        batch_size: Union[int, str] = "auto",
        validation_freq: int = 1,
        early_stopping_min_delta: float = 0.001,
        early_stopping_patience: int = 10,
        reduce_lr_min_delta: float = 0.001,
        reduce_lr_patience: int = 5,
        validation_graph: Graph = None,
        epochs: int = 1000,
        early_stopping_monitor: str = "loss",
        early_stopping_mode: str = "min",
        reduce_lr_monitor: str = "loss",
        reduce_lr_mode: str = "min",
        reduce_lr_factor: float = 0.9,
        verbose: int = 2,
        **kwargs: Dict
    ) -> pd.DataFrame:
        """Return pandas dataframe with training history.

        Parameters
        -----------------------
        train_graph: Graph,
            Graph to use for the training.
        batch_size: Union[int, str] = "auto",
            Batch size for the training epochs.
            If the model has a single GCN layer it is possible
            to specify a variable batch size.
        validation_freq: int = 1,
            How often (in epochs) to run the validation.
            Note that with sparse tensors this step is apparently
            extremely slow and seems to be happening on the GPU.
            You may want to increase it to a value higher than one.
        early_stopping_min_delta: float,
            Minimum delta of metric to stop the training.
        early_stopping_patience: int,
            Number of epochs to wait, without the given minimum delta being
            achieved, before triggering early stopping.
        reduce_lr_min_delta: float,
            Minimum delta of metric to reduce the learning rate.
        reduce_lr_patience: int,
            Number of epochs to wait, without the given minimum delta being
            achieved, before reducing the learning rate.
        validation_graph: Graph = None,
            Graph to use for the validation.
        epochs: int = 1000,
            Epochs to train the model for.
        early_stopping_monitor: str = "loss",
            Metric to monitor for early stopping.
        early_stopping_mode: str = "min",
            Direction of the variation of the monitored metric for early stopping.
        reduce_lr_monitor: str = "loss",
            Metric to monitor for reducing learning rate.
        reduce_lr_mode: str = "min",
            Direction of the variation of the monitored metric for learning rate.
        reduce_lr_factor: float = 0.9,
            Factor for reduction of learning rate.
        verbose: int = 2,
            Whether to show the loading bar.
            Specifically, the options are:
            * 0 or False: No loading bar.
            * 1 or True: Showing only the loading bar for the epochs.
            * 2: Showing loading bar for both epochs and batches.
        **kwargs: Dict,
            Additional kwargs to pass to the Keras fit call.

        Raises
        -----------------------
        ValueError,
            If the given verbose value is not within the available set (0, 1, 2).

        Returns
        -----------------------
        Dataframe with training history.
        """
        try:
            from tqdm.keras import TqdmCallback
            traditional_verbose = False
        except (ImportError, AttributeError):
            traditional_verbose = True
        verbose = validate_verbose(verbose)

        if validation_graph:
            validation_data = (
                self._adjacency_matrix,
                validation_graph.get_one_hot_encoded_node_types().astype(float),
                # This is a known hack to get around limitations from the current
                # implementation that handles the sample weights in TensorFlow.
                pd.Series(validation_graph.get_known_node_types_mask().astype(float))
            )
        else:
            validation_data = None

        if self._use_class_weights:
            class_weight = {
                node_type_id: self._nodes_number / count / self._node_types_number
                for node_type_id, count in train_graph.get_node_type_id_counts_hashmap().items()
            }
        else:
            class_weight = None

        callbacks = kwargs.pop("callbacks", ())
        return pd.DataFrame(self._model.fit(
            self._adjacency_matrix, train_graph.get_one_hot_encoded_node_types().astype(float),
            # This is a known hack to get around limitations from the current
            # implementation that handles the sample weights in TensorFlow.
            sample_weight=pd.Series(train_graph.get_known_node_types_mask().astype(float)),
            validation_data=validation_data,
            epochs=epochs,
            verbose=traditional_verbose and verbose > 0,
            batch_size=self.run_batch_size_check(batch_size),
            validation_freq=validation_freq,
            class_weight=class_weight,
            callbacks=[
                EarlyStopping(
                    monitor=early_stopping_monitor,
                    min_delta=early_stopping_min_delta,
                    patience=early_stopping_patience,
                    mode=early_stopping_mode,
                ),
                ReduceLROnPlateau(
                    monitor=reduce_lr_monitor,
                    min_delta=reduce_lr_min_delta,
                    patience=reduce_lr_patience,
                    factor=reduce_lr_factor,
                    mode=reduce_lr_mode,
                ),
                *((TqdmCallback(verbose=verbose-1),)
                  if not traditional_verbose and verbose > 0 else ()),
                *callbacks
            ],
            **kwargs
        ).history)

    def predict(
        self,
        graph: Graph,
        *args: List,
        batch_size: Union[int, str] = "auto",
        **kwargs: Dict
    ) -> pd.DataFrame:
        """Run predictions on the provided graph."""
        predictions = self._model.predict(
            self._adjacency_matrix,
            *args,
            batch_size=self.run_batch_size_check(batch_size),
            **kwargs
        )
        return pd.DataFrame(
            predictions,
            columns=graph.get_unique_node_type_names(),
            index=graph.get_node_names()
        )

    def evaluate(
        self,
        graph: Graph,
        *args: List,
        batch_size: Union[int, str] = "auto",
        **kwargs: Dict
    ) -> Dict[str, float]:
        """Run evaluation of the model over the provided graph."""
        return dict(zip(
            self._model.metrics_names,
            self._model.evaluate(
                self._adjacency_matrix,
                graph.get_one_hot_encoded_node_types(),
                *args,
                # This is a known hack to get around limitations from the current
                # implementation that handles the sample weights in TensorFlow.
                sample_weight=pd.Series(
                    graph.get_known_node_types_mask().astype(float)),
                batch_size=self.run_batch_size_check(batch_size),
                **kwargs
            )
        ))

Graph Convolutional Neural Network (GCNN) model for graph embedding.

#   GraphConvolutionalNeuralNetwork( graph: Graph, use_weights: Union[str, bool] = 'auto', use_class_weights: bool = True, node_features_number: Union[int, NoneType] = None, node_features: Union[pandas.core.frame.DataFrame, NoneType] = None, number_of_hidden_layers: int = 1, number_of_units_per_hidden_layer: Union[int, List[int]] = 16, use_dense_hidden_layers: bool = False, activations_per_hidden_layer: Union[str, List[str]] = 'relu', kernel_initializer: Union[str, keras.initializers.initializers_v2.Initializer] = 'glorot_uniform', bias_initializer: Union[str, keras.initializers.initializers_v2.Initializer] = 'zeros', kernel_regularizer: Union[str, keras.regularizers.Regularizer] = None, bias_regularizer: Union[str, keras.regularizers.Regularizer] = None, activity_regularizer: Union[str, keras.regularizers.Regularizer] = None, kernel_constraint: Union[str, keras.constraints.Constraint] = None, bias_constraint: Union[str, keras.constraints.Constraint] = None, features_dropout_rate: float = 0.5, optimizer: Union[str, keras.optimizer_v2.optimizer_v2.OptimizerV2] = 'nadam' )
View Source
    def __init__(
        self,
        graph: Graph,
        use_weights: Union[str, bool] = "auto",
        use_class_weights: bool = True,
        node_features_number: Optional[int] = None,
        node_features: Optional[pd.DataFrame] = None,
        number_of_hidden_layers: int = 1,
        number_of_units_per_hidden_layer: Union[int, List[int]] = 16,
        use_dense_hidden_layers: bool = False,
        activations_per_hidden_layer: Union[str, List[str]] = "relu",
        kernel_initializer: Union[str, Initializer] = 'glorot_uniform',
        bias_initializer: Union[str, Initializer] = 'zeros',
        kernel_regularizer: Union[str, Regularizer] = None,
        bias_regularizer: Union[str, Regularizer] = None,
        activity_regularizer: Union[str, Regularizer] = None,
        kernel_constraint: Union[str, Constraint] = None,
        bias_constraint: Union[str, Constraint] = None,
        features_dropout_rate: float = 0.5,
        optimizer: Union[str, Optimizer] = "nadam",
    ):
        """Create new GloVe-based Embedder object.

        Parameters
        -------------------------------
        graph: Graph,
            The data for which to build the model.
        use_class_weights: bool = True,
            Whether to use class weights to rebalance the loss relative to unbalanced classes.
            Learn more about class weights here: https://www.tensorflow.org/tutorials/structured_data/imbalanced_data
        use_dense_hidden_layers: bool = False,
            Whether to use dense layers for the hidden layers.
            This is useful in the context of a shallow GCN, when it is not
            feasible to use a batch size equal to the number of nodes in the graph.
        use_weights: Union[str, bool] = "auto",
            Whether to expect edge weights in input when executing the graph convolution.
            The weights may be used, for instance, to compute a weighting
            based on the symmetric normalized Laplacian method.
        number_of_nodes: Optional[int] = None,
            Number of nodes in the considered graph.
            If the node features are provided, the number of nodes is extracted from the node features.
        node_features_number: Optional[int] = None,
            Number of node features.
            If the node features are provided, the number of features is extracted from the node features.
        node_features: Optional[Union[np.ndarray, pd.DataFrame]] = None,
            Vector with the provided node features.
        trainable: Union[str, bool] = "auto",
            Whether to make the node features trainable.
            By default, with "auto", the embedding is trainable if no node features were provided.
        number_of_hidden_layers: int = 1,
            Number of graph convolution layers.
        number_of_units_per_hidden_layer: Union[int, List[int]] = 16,
            Number of units per hidden layer.
        kernel_initializer: Union[str, Initializer] = 'glorot_uniform',
            Initializer for the kernel weights matrix.
        bias_initializer: Union[str, Initializer] = 'zeros',
            Initializer for the bias vector.
        kernel_regularizer: Union[str, Regularizer] = None,
            Regularizer function applied to the kernel weights matrix.
        bias_regularizer: Union[str, Regularizer] = None,
            Regularizer function applied to the bias vector.
        activity_regularizer: Union[str, Regularizer] = None,
            Regularizer function applied to the output of the activation function.
        kernel_constraint: Union[str, Constraint] = None,
            Constraint function applied to the kernel matrix.
        bias_constraint: Union[str, Constraint] = None,
            Constraint function applied to the bias vector.
        features_dropout_rate: float = 0.5,
            Float between 0 and 1. Fraction of the input units to drop.

        """
        if isinstance(number_of_units_per_hidden_layer, int):
            number_of_units_per_hidden_layer = [
                number_of_units_per_hidden_layer
            ] * number_of_hidden_layers

        if isinstance(activations_per_hidden_layer, str):
            activations_per_hidden_layer = [
                activations_per_hidden_layer
            ] * number_of_hidden_layers

        if len(number_of_units_per_hidden_layer) != number_of_hidden_layers:
            raise ValueError(
                "The number of hidden layers must match"
                "the number of the hidden units per layer provided"
            )

        if len(activations_per_hidden_layer) != number_of_hidden_layers:
            raise ValueError(
                "The number of hidden layers must match"
                "the number of the activations per layer provided"
            )

        use_weights_supported_values = ("auto", True, False)
        if use_weights not in use_weights_supported_values:
            raise ValueError(
                (
                    "The provided value for `use_weights`, '{}', is not among the supported values '{}'."
                ).format(use_weights, use_weights_supported_values)
            )
        if node_features is not None and any(node_features.index != graph.get_node_names()):
            raise ValueError(
                "The provided node features DataFrame is not aligned with the "
                "provided graph nodes."
            )
        if node_features_number is None and node_features is None:
            raise ValueError(
                "Eiter the number of node features or the node features "
                "themselves must be provided."
            )
        if use_weights == "auto":
            use_weights = graph.has_edge_weights()
        if node_features is not None:
            node_features_number = node_features.shape[-1]
        self._use_weights = use_weights
        self._use_class_weights = use_class_weights
        self._node_features_number = node_features_number
        self._node_features = node_features
        self._nodes_number = graph.get_number_of_nodes()
        self._node_types_number = graph.get_node_types_number()
        number_of_units_per_hidden_layer[-1] = self._node_types_number
        self._number_of_hidden_layers = number_of_hidden_layers
        self._use_dense_hidden_layers = use_dense_hidden_layers
        self._kernel_initializer = kernel_initializer
        self._bias_initializer = bias_initializer
        self._kernel_regularizer = kernel_regularizer
        self._bias_regularizer = bias_regularizer
        self._activity_regularizer = activity_regularizer
        self._kernel_constraint = kernel_constraint
        self._bias_constraint = bias_constraint
        self._features_dropout_rate = features_dropout_rate
        self._number_of_units_per_hidden_layer = number_of_units_per_hidden_layer
        self._multi_label = graph.has_multilabel_node_types()
        activations_per_hidden_layer[-1] = "sigmoid" if self._multi_label or self._node_types_number == 1 else "softmax"
        self._activations_per_hidden_layer = activations_per_hidden_layer
        self._optimizer = optimizer
        self._adjacency_matrix = graph_to_sparse_tensor(
            graph,
            use_weights=self._use_weights
        )
        self._model = self._build_model()
        self._compile_model()

Create new Graph Convolutional Neural Network model.

Parameters
  • graph (Graph,): The data for which to build the model.
  • use_class_weights (bool = True,): Whether to use class weights to rebalance the loss relative to unbalanced classes. Learn more about class weights here: https://www.tensorflow.org/tutorials/structured_data/imbalanced_data
  • use_dense_hidden_layers (bool = False,): Whether to use dense layers for the hidden layers. This is useful in the context of a shallow GCN, when it is not feasible to use a batch size equal to the number of nodes in the graph.
  • use_weights (Union[str, bool] = "auto",): Whether to expect edge weights in input when executing the graph convolution. The weights may be used, for instance, to compute a weighting based on the symmetric normalized Laplacian method.
  • number_of_nodes (Optional[int] = None,): Number of nodes in the considered graph. If the node features are provided, the number of nodes is extracted from the node features.
  • node_features_number (Optional[int] = None,): Number of node features. If the node features are provided, the number of features is extracted from the node features.
  • node_features (Optional[Union[np.ndarray, pd.DataFrame]] = None,): Vector with the provided node features.
  • trainable (Union[str, bool] = "auto",): Whether to make the node features trainable. By default, with "auto", the embedding is trainable if no node features were provided.
  • number_of_hidden_layers (int = 1,): Number of graph convolution layers.
  • number_of_units_per_hidden_layer (Union[int, List[int]] = 16,): Number of units per hidden layer.
  • kernel_initializer (Union[str, Initializer] = 'glorot_uniform',): Initializer for the kernel weights matrix.
  • bias_initializer (Union[str, Initializer] = 'zeros',): Initializer for the bias vector.
  • kernel_regularizer (Union[str, Regularizer] = None,): Regularizer function applied to the kernel weights matrix.
  • bias_regularizer (Union[str, Regularizer] = None,): Regularizer function applied to the bias vector.
  • activity_regularizer (Union[str, Regularizer] = None,): Regularizer function applied to the output of the activation function.
  • kernel_constraint (Union[str, Constraint] = None,): Constraint function applied to the kernel matrix.
  • bias_constraint (Union[str, Constraint] = None,): Constraint function applied to the bias vector.
  • features_dropout_rate (float = 0.5,): Float between 0 and 1. Fraction of the input units to drop.
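
As a usage sketch only (not taken from this documentation): assuming `graph` is an already-loaded Graph whose nodes carry node types, and `node_features` is a pandas DataFrame indexed by the graph node names, the constructor documented above can be used roughly as follows.

    # Hedged sketch: `graph` and `node_features` are assumed to exist already;
    # the hyper-parameter values are illustrative, not recommendations.
    model = GraphConvolutionalNeuralNetwork(
        graph=graph,
        node_features=node_features,            # must be aligned with graph.get_node_names()
        number_of_hidden_layers=2,
        number_of_units_per_hidden_layer=16,    # the last layer is resized to the number of node types
        use_dense_hidden_layers=True,           # allows batch sizes smaller than the number of nodes
        features_dropout_rate=0.5,
        optimizer="nadam",
    )
    model.summary()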
#   name: str
#   def summary(self):
View Source
    def summary(self):
        """Print model summary."""
        self._model.summary()

Print model summary.

#   def run_batch_size_check(self, batch_size: Union[int, str]) -> int:
View Source
    def run_batch_size_check(self, batch_size: Union[int, str]) -> int:
        """Runs check for valid batch size given the model.

        Parameters
        -----------------
        batch_size: Union[int, str],
            The batch size to check.

        Raises
        -----------------
        ValueError,
            If the given batch size is not compatible with the
            current model.

        Returns
        -----------------
        Validated batch size.
        """
        if batch_size == "auto":
            return self._nodes_number

        if self._number_of_hidden_layers != 1 and not self._use_dense_hidden_layers and batch_size != self._nodes_number:
            raise ValueError(
                "If the number of GCN layers is greater than 1, "
                "the batch size must be equal to the number of "
                "nodes in the graph.\n"
                "Alternatively you can use Dense layers for the "
                "hidden layers."
            )

        return batch_size

Runs check for valid batch size given the model.

Parameters
  • batch_size (Union[int, str],): The batch size to check.
Raises
  • ValueError,: If the given batch size is not compatible with the current model.
Returns
  • Validated batch size.
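
A short sketch of the rule implemented above (the names `model` and `graph` and the batch size 256 are illustrative assumptions):

    # "auto" resolves to the number of nodes, i.e. full-batch training.
    batch_size = model.run_batch_size_check("auto")

    # A smaller batch size is accepted only for a single-GCN-layer model or when
    # use_dense_hidden_layers=True; otherwise a ValueError is raised.
    batch_size = model.run_batch_size_check(256)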
#   def fit( self, train_graph: Graph, batch_size: Union[int, str] = 'auto', validation_freq: int = 1, early_stopping_min_delta: float = 0.001, early_stopping_patience: int = 10, reduce_lr_min_delta: float = 0.001, reduce_lr_patience: int = 5, validation_graph: Graph = None, epochs: int = 1000, early_stopping_monitor: str = 'loss', early_stopping_mode: str = 'min', reduce_lr_monitor: str = 'loss', reduce_lr_mode: str = 'min', reduce_lr_factor: float = 0.9, verbose: int = 2, **kwargs: Dict ) -> pandas.core.frame.DataFrame:
View Source
    def fit(
        self,
        train_graph: Graph,
        batch_size: Union[int, str] = "auto",
        validation_freq: int = 1,
        early_stopping_min_delta: float = 0.001,
        early_stopping_patience: int = 10,
        reduce_lr_min_delta: float = 0.001,
        reduce_lr_patience: int = 5,
        validation_graph: Graph = None,
        epochs: int = 1000,
        early_stopping_monitor: str = "loss",
        early_stopping_mode: str = "min",
        reduce_lr_monitor: str = "loss",
        reduce_lr_mode: str = "min",
        reduce_lr_factor: float = 0.9,
        verbose: int = 2,
        **kwargs: Dict
    ) -> pd.DataFrame:
        """Return pandas dataframe with training history.

        Parameters
        -----------------------
        train_graph: Graph,
            Graph to use for the training.
        batch_size: Union[int, str] = "auto",
            Batch size for the training epochs.
            If the model has a single GCN layer it is possible
            to specify a variable batch size.
        validation_freq: int = 1,
            How often (in epochs) to run the validation.
            Note that with sparse tensors this step is apparently
            extremely slow and seems to be happening on the GPU.
            You may want to increase it to a value higher than one.
        early_stopping_min_delta: float,
            Minimum delta of metric to stop the training.
        early_stopping_patience: int,
            Number of epochs to wait, without the given minimum delta being
            achieved, before triggering early stopping.
        reduce_lr_min_delta: float,
            Minimum delta of metric to reduce the learning rate.
        reduce_lr_patience: int,
            Number of epochs to wait, without the given minimum delta being
            achieved, before reducing the learning rate.
        validation_graph: Graph = None,
            Graph to use for the validation.
        epochs: int = 1000,
            Epochs to train the model for.
        early_stopping_monitor: str = "loss",
            Metric to monitor for early stopping.
        early_stopping_mode: str = "min",
            Direction of the variation of the monitored metric for early stopping.
        reduce_lr_monitor: str = "loss",
            Metric to monitor for reducing learning rate.
        reduce_lr_mode: str = "min",
            Direction of the variation of the monitored metric for learning rate.
        reduce_lr_factor: float = 0.9,
            Factor for reduction of learning rate.
        verbose: int = 2,
            Whether to show the loading bar.
            Specifically, the options are:
            * 0 or False: No loading bar.
            * 1 or True: Showing only the loading bar for the epochs.
            * 2: Showing loading bar for both epochs and batches.
        **kwargs: Dict,
            Additional kwargs to pass to the Keras fit call.

        Raises
        -----------------------
        ValueError,
            If the given verbose value is not within the available set (0, 1, 2).

        Returns
        -----------------------
        Dataframe with training history.
        """
        try:
            from tqdm.keras import TqdmCallback
            traditional_verbose = False
        except (ImportError, AttributeError):
            traditional_verbose = True
        verbose = validate_verbose(verbose)

        if validation_graph:
            validation_data = (
                self._adjacency_matrix,
                validation_graph.get_one_hot_encoded_node_types().astype(float),
                # This is a known hack to get around limitations from the current
                # implementation that handles the sample weights in TensorFlow.
                pd.Series(validation_graph.get_known_node_types_mask().astype(float))
            )
        else:
            validation_data = None

        if self._use_class_weights:
            class_weight = {
                node_type_id: self._nodes_number / count / self._node_types_number
                for node_type_id, count in train_graph.get_node_type_id_counts_hashmap().items()
            }
        else:
            class_weight = None

        callbacks = kwargs.pop("callbacks", ())
        return pd.DataFrame(self._model.fit(
            self._adjacency_matrix, train_graph.get_one_hot_encoded_node_types().astype(float),
            # This is a known hack to get around limitations from the current
            # implementation that handles the sample weights in TensorFlow.
            sample_weight=pd.Series(train_graph.get_known_node_types_mask().astype(float)),
            validation_data=validation_data,
            epochs=epochs,
            verbose=traditional_verbose and verbose > 0,
            batch_size=self.run_batch_size_check(batch_size),
            validation_freq=validation_freq,
            class_weight=class_weight,
            callbacks=[
                EarlyStopping(
                    monitor=early_stopping_monitor,
                    min_delta=early_stopping_min_delta,
                    patience=early_stopping_patience,
                    mode=early_stopping_mode,
                ),
                ReduceLROnPlateau(
                    monitor=reduce_lr_monitor,
                    min_delta=reduce_lr_min_delta,
                    patience=reduce_lr_patience,
                    factor=reduce_lr_factor,
                    mode=reduce_lr_mode,
                ),
                *((TqdmCallback(verbose=verbose-1),)
                  if not traditional_verbose and verbose > 0 else ()),
                *callbacks
            ],
            **kwargs
        ).history)

Return pandas dataframe with training history.

Parameters
  • train_graph (Graph,): Graph to use for the training.
  • batch_size (Union[int, str] = "auto",): Batch size for the training epochs. If the model has a single GCN layer it is possible to specify a variable batch size.
  • validation_freq (int = 1,): How often (in epochs) to run the validation. Note that with sparse tensors this step is apparently extremely slow and seems to be happening on the GPU. You may want to increase it to a value higher than one.
  • early_stopping_min_delta (float,): Minimum delta of metric to stop the training.
  • early_stopping_patience (int,): Number of epochs to wait, without the given minimum delta being achieved, before triggering early stopping.
  • reduce_lr_min_delta (float,): Minimum delta of metric to reduce the learning rate.
  • reduce_lr_patience (int,): Number of epochs to wait, without the given minimum delta being achieved, before reducing the learning rate.
  • validation_graph (Graph = None,): Graph to use for the validation.
  • epochs (int = 1000,): Epochs to train the model for.
  • early_stopping_monitor (str = "loss",): Metric to monitor for early stopping.
  • early_stopping_mode (str = "min",): Direction of the variation of the monitored metric for early stopping.
  • reduce_lr_monitor (str = "loss",): Metric to monitor for reducing learning rate.
  • reduce_lr_mode (str = "min",): Direction of the variation of the monitored metric for learning rate.
  • reduce_lr_factor (float = 0.9,): Factor for reduction of learning rate.
  • verbose (int = 2,): Whether to show the loading bar. Specifically, the options are:
    • 0 or False: No loading bar.
    • 1 or True: Showing only the loading bar for the epochs.
    • 2: Showing loading bar for both epochs and batches.
  • **kwargs (Dict,): Additional kwargs to pass to the Keras fit call.
Raises
  • ValueError,: If the given verbose value is not within the available set (0, 1, 2).
Returns
  • Dataframe with training history.
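
A hedged training sketch: `train_graph` and `validation_graph` are assumed to be pre-built Graph instances sharing the node set of the graph used to construct the model; the remaining arguments restate values documented above.

    history = model.fit(
        train_graph,
        validation_graph=validation_graph,
        epochs=1000,
        batch_size="auto",           # full batch: one batch containing all nodes
        validation_freq=10,          # validation on sparse tensors can be slow, so run it sparsely
        early_stopping_patience=10,
        reduce_lr_patience=5,
        verbose=2,
    )
    # `fit` returns the Keras training history as a pandas DataFrame.
    print(history["loss"].tail())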
#   def predict( self, graph: Graph, *args: List, batch_size: Union[int, str] = 'auto', **kwargs: Dict ) -> pandas.core.frame.DataFrame:
View Source
    def predict(
        self,
        graph: Graph,
        *args: List,
        batch_size: Union[int, str] = "auto",
        **kwargs: Dict
    ) -> pd.DataFrame:
        """Run predictions on the provided graph."""
        predictions = self._model.predict(
            self._adjacency_matrix,
            *args,
            batch_size=self.run_batch_size_check(batch_size),
            **kwargs
        )
        return pd.DataFrame(
            predictions,
            columns=graph.get_unique_node_type_names(),
            index=graph.get_node_names()
        )

Run predictions on the provided graph.
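
A minimal prediction sketch; the `idxmax` post-processing is an illustrative assumption and not part of the documented API.

    # DataFrame indexed by node names, with one column per node type.
    predictions = model.predict(graph)
    # Illustrative assumption: pick the highest-scoring node type per node.
    predicted_node_types = predictions.idxmax(axis=1)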

#   def evaluate( self, graph: Graph, *args: List, batch_size: Union[int, str] = 'auto', **kwargs: Dict ) -> Dict[str, float]:
View Source
    def evaluate(
        self,
        graph: Graph,
        *args: List,
        batch_size: Union[int, str] = "auto",
        **kwargs: Dict
    ) -> Dict[str, float]:
        """Run evaluation of the model over the provided graph."""
        return dict(zip(
            self._model.metrics_names,
            self._model.evaluate(
                self._adjacency_matrix,
                graph.get_one_hot_encoded_node_types(),
                *args,
                # This is a known hack to get around limitations from the current
                # implementation that handles the sample weights in TensorFlow.
                sample_weight=pd.Series(
                    graph.get_known_node_types_mask().astype(float)),
                batch_size=self.run_batch_size_check(batch_size),
                **kwargs
            )
        ))

Run evaluation of the model over the provided graph.
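
Finally, a brief evaluation sketch: the returned dictionary maps the names of the metrics compiled into the model to their values (the exact metric names depend on get_minimal_multiclass_metrics and are not listed on this page).

    metrics = model.evaluate(graph, batch_size="auto")
    for name, value in metrics.items():
        print(f"{name}: {value:.4f}")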