import random
import re
import string
from qstn.inference.survey_inference import batch_generation
[docs]
def key_typos(text: str, probability: float = 0.1) -> str:
    """
    Simulate typos by overwriting letters with randomly drawn ASCII letters.

    Only alphabetic characters are candidates for replacement; whitespace,
    digits and punctuation always pass through untouched. The replacement
    is drawn from ``string.ascii_letters``, so it can differ in case from
    the character it replaces, or happen to be the very same character.

    Args:
        text (str): The input text to perturb.
        probability (float): Chance, per alphabetic character, of it
            being replaced.

    Returns:
        str: The perturbed text. Falsy input (e.g. an empty string) is
            returned unchanged.
    """
    if not text:
        return text

    candidates = string.ascii_letters

    def _maybe_replace(original: str) -> str:
        # The random draw happens only for alphabetic characters, so
        # non-letters never consume a random number.
        if original.isalpha() and random.random() < probability:
            return random.choice(candidates)
        return original

    return "".join(map(_maybe_replace, text))
[docs]
def keyboard_typos(text: str, probability: float = 0.1) -> str:
    """
    Simulate typos by substituting letters with keys adjacent on a keyboard.

    Each character whose lowercase form appears in the neighbour table is,
    with the given probability, replaced by one of its listed neighbours.
    The replacement is upper-cased when the original character was
    uppercase. Characters without an entry in the table (digits, spaces,
    punctuation, ...) are never changed.

    Args:
        text (str): The input text to perturb.
        probability (float): Chance, per eligible character, of it being
            replaced by a neighbouring key.

    Returns:
        str: The perturbed text. Falsy input (e.g. an empty string) is
            returned unchanged.
    """
    # Lowercase letter -> letters treated as adjacent to it.
    keyboard_neighbors = dict(
        a="qwsz",
        b="vghn",
        c="xdfv",
        d="ersfcx",
        e="wsdr",
        f="rtgdvc",
        g="tyfhbv",
        h="yugjbn",
        i="ujko",
        j="uikhnm",
        k="ijolm",
        l="opk",
        m="njk",
        n="bhjm",
        o="iklp",
        p="ol",
        q="wa",
        r="edft",
        s="awedz",
        t="rfgy",
        u="yhji",
        v="cfgb",
        w="qase",
        x="zsdc",
        y="tugh",
        z="asx",
    )
    if not text:
        return text

    perturbed = []
    for original in text:
        adjacent = keyboard_neighbors.get(original.lower())
        # Ineligible characters are kept without drawing a random number.
        if adjacent is None or not (random.random() < probability):
            perturbed.append(original)
            continue
        replacement = random.choice(adjacent)
        # Preserve the case of the character being replaced.
        perturbed.append(replacement.upper() if original.isupper() else replacement)
    return "".join(perturbed)
[docs]
def letter_swaps(text: str, probability: float = 0.1) -> str:
    """
    Simulate transposition typos by swapping neighbouring characters.

    The text is scanned left to right. At each position the character and
    its right-hand neighbour are exchanged with the given probability; any
    pair of adjacent characters qualifies, not only letters. After a swap
    the scan jumps past both characters so a character is never moved twice.

    Args:
        text (str): The input text to perturb.
        probability (float): Chance of swapping the pair starting at each
            visited position.

    Returns:
        str: The perturbed text. Falsy input (e.g. an empty string) is
            returned unchanged.
    """
    if not text:
        return text

    chars = list(text)
    last_index = len(chars) - 1
    position = 0
    while position < last_index:
        do_swap = random.random() < probability
        if do_swap:
            following = position + 1
            chars[position], chars[following] = chars[following], chars[position]
        # Step over both characters of a swapped pair, otherwise advance by one.
        position += 2 if do_swap else 1
    return "".join(chars)
[docs]
def make_synonyms(all_prompts: list[str], model: str, instruction: str) -> list[str]:
    """
    Uses a language model to replace words with their synonyms.

    Every prompt is split on curly-brace placeholders (``{...}``). The
    placeholders and whitespace-only pieces are kept verbatim; every other
    piece is sent to the model in a single batched call and the model's
    output is stitched back into the original layout.

    Args:
        all_prompts (List[str]): The input prompts as a list to perturb.
        model: The language model handle forwarded unchanged to
            ``batch_generation``. NOTE(review): the annotation says ``str``
            while the original documentation describes a vllm LLM object --
            confirm which one callers pass.
        instruction (str): The instruction prompt for the model; it is
            prepended directly (no separator) to each text segment.

    Returns:
        List[str]: One rewritten prompt per input prompt, in input order.
            When no prompt contains rewritable text, the prompts are
            returned unchanged and the model is not called.
    """
    system_msg = (
        "You are a helpful assistant that replaces words with their synonyms "
        "while preserving the original meaning."
    )
    placeholder_pattern = re.compile(r"(\{.*?\})")

    # Flat list of every text segment that goes to the model.
    segments = []
    # Per prompt: list of (keep_verbatim, content). For verbatim entries the
    # content is the text itself; otherwise it is an index into `segments`.
    layouts = []
    for prompt in all_prompts:
        layout = []
        for part in placeholder_pattern.split(prompt):
            is_placeholder = part.startswith("{") and part.endswith("}")
            if is_placeholder or not part.strip():
                layout.append((True, part))
            else:
                layout.append((False, len(segments)))
                segments.append(part)
        layouts.append(layout)

    if not segments:
        # Nothing to rewrite (no prompts, or only placeholders/whitespace):
        # skip the model call rather than submitting an empty batch.
        return list(all_prompts)

    # Only the first element of the returned 3-tuple is used; it is indexed
    # by segment position below.
    flat_results, _, _ = batch_generation(
        model=model,
        system_messages=[system_msg] * len(segments),
        prompts=[instruction + text for text in segments],
        max_tokens=1024,
    )

    return [
        "".join(
            content if keep_verbatim else flat_results[content]
            for keep_verbatim, content in layout
        )
        for layout in layouts
    ]
[docs]
def make_paraphrase(all_prompts: list[str], model: str, instruction: str) -> list[str]:
    """
    Uses a language model to paraphrase the input text.

    Every prompt is split on curly-brace placeholders (``{...}``). The
    placeholders and whitespace-only pieces are kept verbatim; every other
    piece is sent to the model in a single batched call and the model's
    output is stitched back into the original layout.

    Args:
        all_prompts (List[str]): The input prompts as a list to perturb.
        model: The language model handle forwarded unchanged to
            ``batch_generation``. NOTE(review): the annotation says ``str``
            while the original documentation describes a vllm LLM object --
            confirm which one callers pass.
        instruction (str): The instruction prompt for the model; it is
            prepended directly (no separator) to each text segment.

    Returns:
        List[str]: One paraphrased prompt per input prompt, in input order.
            When no prompt contains rewritable text, the prompts are
            returned unchanged and the model is not called.
    """
    system_msg = (
        "You are a helpful assistant that paraphrases text while preserving the original meaning."
    )
    placeholder_pattern = re.compile(r"(\{.*?\})")

    # Flat list of every text segment that goes to the model.
    segments = []
    # Per prompt: list of (keep_verbatim, content). For verbatim entries the
    # content is the text itself; otherwise it is an index into `segments`.
    layouts = []
    for prompt in all_prompts:
        layout = []
        for part in placeholder_pattern.split(prompt):
            is_placeholder = part.startswith("{") and part.endswith("}")
            if is_placeholder or not part.strip():
                layout.append((True, part))
            else:
                layout.append((False, len(segments)))
                segments.append(part)
        layouts.append(layout)

    if not segments:
        # Nothing to rewrite (no prompts, or only placeholders/whitespace):
        # skip the model call rather than submitting an empty batch.
        return list(all_prompts)

    # Only the first element of the returned 3-tuple is used; it is indexed
    # by segment position below.
    flat_results, _, _ = batch_generation(
        model=model,
        system_messages=[system_msg] * len(segments),
        prompts=[instruction + text for text in segments],
        max_tokens=1024,
    )

    return [
        "".join(
            content if keep_verbatim else flat_results[content]
            for keep_verbatim, content in layout
        )
        for layout in layouts
    ]
[docs]
def apply_safe_perturbation(prompts: list, perturbation_func, **kwargs):
    """
    Perturb prompts while leaving curly-brace placeholders intact.

    Placeholders such as ``{PROMPT_OPTIONS}`` are protected so the keys can
    still be substituted later. Two paths exist:

    * ``make_synonyms`` / ``make_paraphrase`` receive the full list of
      prompts in one call, together with the ``model`` and ``instruction``
      keyword arguments (``None`` when absent).
    * Any other function is applied piece by piece to the text between
      placeholders and receives all ``**kwargs`` unchanged.

    Args:
        prompts (List[str]): The input prompts containing placeholders.
        perturbation_func (function): The function to apply to
            non-placeholder text.
        **kwargs: Additional keyword arguments for the perturbation
            function (e.g., probability).

    Returns:
        List[str]: The prompts with perturbations applied safely. Falsy
            input (e.g. an empty list) is returned unchanged.
    """
    if not prompts:
        return prompts

    if perturbation_func in [make_synonyms, make_paraphrase]:
        print("Using batch perturbation function:", perturbation_func)
        # Membership was established above, so exactly one of the two applies.
        if perturbation_func == make_synonyms:
            batch_func = make_synonyms
        else:
            batch_func = make_paraphrase
        return batch_func(
            all_prompts=prompts,
            model=kwargs.get("model"),
            instruction=kwargs.get("instruction"),
        )

    placeholder_pattern = re.compile(r"(\{.*?\})")

    def _perturb_outside_placeholders(prompt):
        # Placeholders are copied through verbatim; every other piece
        # (including empty ones produced by the split) goes through the
        # perturbation function.
        return "".join(
            piece
            if piece.startswith("{") and piece.endswith("}")
            else perturbation_func(piece, **kwargs)
            for piece in placeholder_pattern.split(prompt)
        )

    return [_perturb_outside_placeholders(prompt) for prompt in prompts]