Source code for qstn.utilities.prompt_perturbations

import random
import re
import string

from qstn.inference.survey_inference import batch_generation



[docs]
def key_typos(text: str, probability: float = 0.1) -> str:
    """
    Randomly replaces characters with random alphabet letters to simulate typos.
    Args:
        text (str): The input text to perturb.
        probability (float): The probability of replacing each character.
    Returns:
        str: The text with random character replacements based on the given probability.
    """
    if not text:
        return text

    # Get all possible letters (a-z and A-Z)
    alphabet = string.ascii_letters

    text_list = list(text)

    for i, char in enumerate(text_list):
        # We check char.isalpha() so we don't replace spaces or punctuation
        if char.isalpha() and random.random() < probability:
            text_list[i] = random.choice(alphabet)

    return "".join(text_list)




[docs]
def keyboard_typos(text: str, probability: float = 0.1) -> str:
    """
    Introduces typos based on keyboard proximity.
    Args:
        text (str): The input text to perturb.
        probability (float): The probability of introducing a typo for each character.
    Returns:
        str: The text with keyboard-based typos introduced based on the given probability.
    """
    keyboard_neighbors = {
        "a": "qwsz",
        "b": "vghn",
        "c": "xdfv",
        "d": "ersfcx",
        "e": "wsdr",
        "f": "rtgdvc",
        "g": "tyfhbv",
        "h": "yugjbn",
        "i": "ujko",
        "j": "uikhnm",
        "k": "ijolm",
        "l": "opk",
        "m": "njk",
        "n": "bhjm",
        "o": "iklp",
        "p": "ol",
        "q": "wa",
        "r": "edft",
        "s": "awedz",
        "t": "rfgy",
        "u": "yhji",
        "v": "cfgb",
        "w": "qase",
        "x": "zsdc",
        "y": "tugh",
        "z": "asx",
    }

    if not text:
        return text

    text_list = list(text)
    for i in range(len(text_list)):
        char = text_list[i].lower()
        if char in keyboard_neighbors and random.random() < probability:
            neighbors = keyboard_neighbors[char]
            typo_char = random.choice(neighbors)
            # Preserve original case
            if text_list[i].isupper():
                typo_char = typo_char.upper()
            text_list[i] = typo_char
    return "".join(text_list)




[docs]
def letter_swaps(text: str, probability: float = 0.1) -> str:
    """
    Randomly swaps adjacent letters in the text.
    Args:
        text (str): The input text to perturb.
        probability (float): The probability of swapping each adjacent letter pair.
    Returns:
        str: The text with adjacent letters swapped based on the given probability.
    """
    if not text:
        return text

    text_list = list(text)
    i = 0
    while i < len(text_list) - 1:
        if random.random() < probability:
            text_list[i], text_list[i + 1] = text_list[i + 1], text_list[i]
            i += 2  # Skip next character to avoid double swapping
        else:
            i += 1
    return "".join(text_list)




[docs]
def make_synonyms(all_prompts: list[str], model: str, instruction: str) -> str:
    """
    Uses a language model to replace words with their synonyms.
    Args:
        all_prompts (List[str]): The input prompts as a list to perturb.
        model (str): The language model to use for generating synonyms as a vllm LLM object.
        instruction (str): The instruction prompt for the model.
    Returns:
        List[str]: The prompts with words replaced by their synonyms as a list of strings.
    """
    system_msg = (
        "You are a helpful assistant that replaces words with their synonyms "
        "while preserving the original meaning."
    )

    all_segments_to_perturb = []
    prompt_maps = []

    for prompt in all_prompts:
        parts = re.split(r"(\{.*?\})", prompt)
        structure = []  # Stores (is_placeholder, content)
        for part in parts:
            is_placeholder = part.startswith("{") and part.endswith("}")
            if not is_placeholder and part.strip():
                structure.append((False, len(all_segments_to_perturb)))
                all_segments_to_perturb.append(part)
            else:
                structure.append((True, part))
        prompt_maps.append(structure)

    flat_results, _, _ = batch_generation(
        model=model,
        system_messages=[system_msg] * len(all_segments_to_perturb),
        prompts=[instruction + text for text in all_segments_to_perturb],
        # response_generation_method=[ResponseGenerationMethod()] * len(all_segments_to_perturb),
        max_tokens=1024,
    )

    final_prompts = []
    for structure in prompt_maps:
        reconstructed = []
        for is_placeholder, content in structure:
            if is_placeholder:
                reconstructed.append(content)
            else:
                reconstructed.append(flat_results[content])
        final_prompts.append("".join(reconstructed))

    return final_prompts




[docs]
def make_paraphrase(all_prompts: list[str], model: str, instruction: str) -> str:
    """
    Uses a language model to paraphrase the input text.
    Args:
        all_prompts (List[str]): The input prompts as a list to perturb.
        model (str): The language model to use for paraphrasing as a vllm LLM object.
        instruction (str): The instruction prompt for the model.
    Returns:
        List[str]: The paraphrased text as a list of strings.
    """
    system_msg = (
        "You are a helpful assistant that paraphrases text while preserving the original meaning."
    )

    all_segments_to_perturb = []
    prompt_maps = []

    for prompt in all_prompts:
        parts = re.split(r"(\{.*?\})", prompt)
        structure = []  # Stores (is_placeholder, content)
        for part in parts:
            is_placeholder = part.startswith("{") and part.endswith("}")
            if not is_placeholder and part.strip():
                structure.append((False, len(all_segments_to_perturb)))
                all_segments_to_perturb.append(part)
            else:
                structure.append((True, part))
        prompt_maps.append(structure)

    flat_results, _, _ = batch_generation(
        model=model,
        system_messages=[system_msg] * len(all_segments_to_perturb),
        prompts=[instruction + text for text in all_segments_to_perturb],
        # response_generation_method=[ResponseGenerationMethod()] * len(all_segments_to_perturb),
        max_tokens=1024,
    )

    final_prompts = []
    for structure in prompt_maps:
        reconstructed = []
        for is_placeholder, content in structure:
            if is_placeholder:
                reconstructed.append(content)
            else:
                reconstructed.append(flat_results[content])
        final_prompts.append("".join(reconstructed))

    return final_prompts




[docs]
def apply_safe_perturbation(prompts: list, perturbation_func, **kwargs):
    """
    Splits list of prompts by curly brace placeholders (e.g., {PROMPT_OPTIONS}).
    Applies the perturbation_func ONLY to the prompts segments, protecting the keys.

    Args:
        prompts (List[str]): The input prompts containing placeholders.
        perturbation_func (function): The function to apply to non-placeholder text.
        **kwargs: Additional keyword arguments to pass to the perturbation
            function (e.g., probability).
    Returns:
        List[str]: The prompts with perturbations applied safely.
    """
    import re

    if not prompts:
        return prompts

    if perturbation_func in [make_synonyms, make_paraphrase]:
        print("Using batch perturbation function:", perturbation_func)
        final_prompts = prompts
        if perturbation_func == make_synonyms:
            final_prompts = make_synonyms(
                all_prompts=prompts,
                model=kwargs.get("model"),
                instruction=kwargs.get("instruction"),
            )
        elif perturbation_func == make_paraphrase:
            final_prompts = make_paraphrase(
                all_prompts=prompts,
                model=kwargs.get("model"),
                instruction=kwargs.get("instruction"),
            )
        return final_prompts
    else:
        perturbed_prompts = []
        for prompt in prompts:

            parts = re.split(r"(\{.*?\})", prompt)

            processed_parts = []
            for part in parts:
                # Check if this part is a placeholder
                if part.startswith("{") and part.endswith("}"):
                    # Append exactly as is
                    processed_parts.append(part)
                else:
                    # Apply the typo function
                    processed_parts.append(perturbation_func(part, **kwargs))

            perturbed_prompts.append("".join(processed_parts))

        return perturbed_prompts