Source code for qstn.prompt_builder

import copy
import random
import warnings
from collections.abc import Sequence
from dataclasses import dataclass, replace
from enum import StrEnum
from string import ascii_lowercase, ascii_uppercase
from typing import Any, Literal, Self, overload

import pandas as pd

from ._questionnaire_loader import (
    QuestionnaireLoaderColumn,
    optional_bool,
    optional_int,
    optional_list,
    optional_row_value,
    optional_template,
    row_has_value,
)
from .inference.response_generation import (
    ChoiceResponseGenerationMethod,
    JSONReasoningResponseGenerationMethod,
    JSONSingleResponseGenerationMethod,
    JSONVerbalizedDistribution,
    LogprobResponseGenerationMethod,
    ResponseGenerationMethod,
    resolve_battery_response_generation_method,
)
from .utilities import constants, placeholder, prompt_templates
from .utilities.constants import QuestionnairePresentation
from .utilities.survey_objects import AnswerOptions, AnswerTexts, QuestionnaireItem
from .utilities.utils import safe_format_with_regex


[docs] class ResponseGenerationPreset(StrEnum): """Named response-generation methods supported by questionnaire loading.""" NONE = "none" CHOICE = "choice" LOGPROB = "logprob" JSON_SINGLE = "json_single" JSON_REASONING = "json_reasoning" JSON_DISTRIBUTION = "json_distribution"
[docs] @dataclass(frozen=True) class BaseModelPromptTemplate: """Template used to render chat-style turns for base-model prompts.""" user_prefix: str | None = "User:" assistant_prefix: str | None = "Assistant:" separator: str = "\n" system_prefix: str | None = None
def _render_prefixed(prefix: str | None, content: str) -> str: """Render a single prompt block, preserving empty prefixes and content.""" if prefix is None: return content return f"{prefix}\n{content}"
[docs] def messages_to_base_model_prompt( messages: Sequence[dict[str, str]], prompt_template: BaseModelPromptTemplate | None = None, ) -> str: """Render chat-style messages into a plain prompt for base models.""" template = prompt_template or BaseModelPromptTemplate() blocks: list[str] = [] for message in messages: role = message["role"] content = message["content"] if role == "system": blocks.append(_render_prefixed(template.system_prefix, content)) elif role == "user": blocks.append(_render_prefixed(template.user_prefix, content)) elif role == "assistant": blocks.append(_render_prefixed(template.assistant_prefix, content)) else: raise ValueError(f"Unsupported message role for base-model rendering: {role}") if template.assistant_prefix is not None: blocks.append(template.assistant_prefix) return template.separator.join(blocks)
def _build_response_generation_method( row: pd.Series, item_id: Any, ) -> ResponseGenerationMethod | None: column = QuestionnaireLoaderColumn.RESPONSE_GENERATION_METHOD value = optional_row_value(row, column) if value is None: return None if isinstance(value, ResponseGenerationMethod): return value preset_value = str(value).strip().lower() try: preset = ResponseGenerationPreset(preset_value) except ValueError as exc: supported = ", ".join(preset.value for preset in ResponseGenerationPreset) raise ValueError( f"Unsupported response_generation_method '{value}' for questionnaire_item_id " f"'{item_id}'. Supported presets are: {supported}." ) from exc if preset == ResponseGenerationPreset.NONE: return None output_index_only = optional_bool( row, QuestionnaireLoaderColumn.OUTPUT_INDEX_ONLY, item_id, default=False, ) constrain_answer_options = optional_bool( row, QuestionnaireLoaderColumn.CONSTRAIN_ANSWER_OPTIONS, item_id, default=True, ) if preset == ResponseGenerationPreset.CHOICE: return ChoiceResponseGenerationMethod( allowed_choices_template="{options}", output_index_only=output_index_only, ) if preset == ResponseGenerationPreset.LOGPROB: return LogprobResponseGenerationMethod( allowed_choices_template="{options}", output_index_only=output_index_only, ) if preset == ResponseGenerationPreset.JSON_SINGLE: return JSONSingleResponseGenerationMethod( output_index_only=output_index_only, constrain_answer_options=constrain_answer_options, ) if preset == ResponseGenerationPreset.JSON_REASONING: return JSONReasoningResponseGenerationMethod( output_index_only=output_index_only, constrain_answer_options=constrain_answer_options, ) if preset == ResponseGenerationPreset.JSON_DISTRIBUTION: return JSONVerbalizedDistribution(output_index_only=output_index_only) return None def _has_likert_config(row: pd.Series) -> bool: return any( row_has_value(row, column) for column in QuestionnaireLoaderColumn if column.value.startswith("likert_") ) def _build_answer_options_from_row(row: 
pd.Series, item_id: Any) -> AnswerOptions | None: answer_texts = optional_list(row, QuestionnaireLoaderColumn.ANSWER_TEXTS, item_id) answer_codes = optional_list(row, QuestionnaireLoaderColumn.ANSWER_CODES, item_id) response_generation_method = _build_response_generation_method(row, item_id) list_prompt_template = optional_template( row, QuestionnaireLoaderColumn.LIST_PROMPT_TEMPLATE, prompt_templates.LIST_OPTIONS_DEFAULT, ) scale_prompt_template = optional_template( row, QuestionnaireLoaderColumn.SCALE_PROMPT_TEMPLATE, prompt_templates.SCALE_OPTIONS_DEFAULT, ) index_answer_separator = optional_template( row, QuestionnaireLoaderColumn.INDEX_ANSWER_SEPARATOR, ": ", ) options_separator = optional_template(row, QuestionnaireLoaderColumn.OPTIONS_SEPARATOR, ", ") if _has_likert_config(row): only_from_to_scale = optional_bool( row, QuestionnaireLoaderColumn.LIKERT_ONLY_FROM_TO_SCALE, item_id, default=False, ) explicit_n = optional_int(row, QuestionnaireLoaderColumn.LIKERT_N, item_id) if explicit_n is None: if only_from_to_scale: raise ValueError( f"Column '{QuestionnaireLoaderColumn.LIKERT_N}' is required for " f"from-to Likert scales on questionnaire_item_id '{item_id}'." ) if answer_texts is None: raise ValueError( f"Column '{QuestionnaireLoaderColumn.LIKERT_N}' is required when " f"'{QuestionnaireLoaderColumn.ANSWER_TEXTS}' is missing for " f"questionnaire_item_id '{item_id}'." ) n = len(answer_texts) else: n = explicit_n idx_type = str( optional_row_value(row, QuestionnaireLoaderColumn.LIKERT_IDX_TYPE, "integer") ) if idx_type not in {"char_lower", "char_upper", "integer", "no_index"}: raise ValueError( f"Column '{QuestionnaireLoaderColumn.LIKERT_IDX_TYPE}' for " f"questionnaire_item_id '{item_id}' must be one of: " "char_lower, char_upper, integer, no_index." 
) return generate_likert_options( n=n, answer_texts=answer_texts, only_from_to_scale=only_from_to_scale, random_order=optional_bool( row, QuestionnaireLoaderColumn.LIKERT_RANDOM_ORDER, item_id, default=False, ), reversed_order=optional_bool( row, QuestionnaireLoaderColumn.LIKERT_REVERSED_ORDER, item_id, default=False, ), even_order=optional_bool( row, QuestionnaireLoaderColumn.LIKERT_EVEN_ORDER, item_id, default=False, ), add_middle_category=optional_bool( row, QuestionnaireLoaderColumn.LIKERT_ADD_MIDDLE_CATEGORY, item_id, default=False, ), str_middle_cat=str( optional_row_value( row, QuestionnaireLoaderColumn.LIKERT_MIDDLE_CATEGORY, "Neutral", ) ), add_refusal=optional_bool( row, QuestionnaireLoaderColumn.LIKERT_ADD_REFUSAL, item_id, default=False, ), refusal_code=str( optional_row_value(row, QuestionnaireLoaderColumn.LIKERT_REFUSAL_CODE, "-99") ), start_idx=optional_int( row, QuestionnaireLoaderColumn.LIKERT_START_IDX, item_id, default=1, ), list_prompt_template=list_prompt_template, scale_prompt_template=scale_prompt_template, index_answer_separator=index_answer_separator, options_separator=options_separator, idx_type=idx_type, response_generation_method=response_generation_method, ) if answer_texts is None and answer_codes is None: if response_generation_method is not None: raise ValueError( f"questionnaire_item_id '{item_id}' defines a response_generation_method " "but no answer_texts or answer_codes." ) return None if ( answer_texts is not None and answer_codes is not None and len(answer_texts) != len(answer_codes) ): raise ValueError( f"answer_texts and answer_codes must have the same length for " f"questionnaire_item_id '{item_id}'." 
) answer_texts_object = AnswerTexts( answer_texts=answer_texts, indices=answer_codes, index_answer_seperator=index_answer_separator, option_seperators=options_separator, ) return AnswerOptions( answer_texts=answer_texts_object, list_prompt_template=list_prompt_template, scale_prompt_template=scale_prompt_template, response_generation_method=response_generation_method, )
[docs] class LLMPrompt: """ Main class for setting up and managing the prompt in the LLM experiment. This class handles loading questions from a predefined questionnaire, preparing prompts, managing answer options, and generating prompt structures for different interview types. """ DEFAULT_QUESTIONNAIRE_ID: str = "Questionnaire" DEFAULT_SYSTEM_PROMPT: str = ( "You will be given questions and possible answer options for each. " "Please reason about each question before answering." ) DEFAULT_TASK_INSTRUCTION: str = "" DEFAULT_JSON_STRUCTURE: list[str] = ["reasoning", "answer"] DEFAULT_PROMPT_STRUCTURE: str = f"{placeholder.PROMPT_QUESTIONS}\n{placeholder.PROMPT_OPTIONS}" def __init__( self, questionnaire_source: str | pd.DataFrame = None, questionnaire_name: str = DEFAULT_QUESTIONNAIRE_ID, system_prompt: str | None = DEFAULT_SYSTEM_PROMPT, prompt: str = DEFAULT_PROMPT_STRUCTURE, verbose: bool = False, seed: int = 42, ): """ Initialize an LLMPrompt instance. Either a path to a csv file or a pandas dataframe can be provided to structure the questionnaire. Question structure can later be modified with explicit methods such as `insert_questions`, `replace_question`, and `remove_question`. Args: questionnaire_source (str/pd.Dataframe): Path to the CSV file containing the questionnaire structure and questions. questionnaire_name (str): Name/ID for the questionnaire. system_prompt (str | None): System prompt for all questions. Set to `None` to omit a system message. prompt (str): Prompt for all questions. verbose (bool): Deprecated. Use `qstn.logger.configure_logging` to enable logging output. seed (int): Random seed for reproducibility. """ if verbose: warnings.warn( "`verbose` is deprecated and will be removed in a future release. 
" "Use `qstn.logger.configure_logging` to enable logging output.", DeprecationWarning, stacklevel=2, ) random.seed(seed) self._questions: list[QuestionnaireItem] = [] if self._check_valid_questionnaire(questionnaire_source): self.load_questionnaire_format(questionnaire_source=questionnaire_source) self.verbose: bool = verbose self.questionnaire_name: str = questionnaire_name self.system_prompt: str | None = system_prompt self.prompt: str = prompt self.base_model_prompt_template: BaseModelPromptTemplate | None = None def _check_valid_questionnaire(self, questionnaire_source: str | pd.DataFrame = None) -> bool: # No Object if questionnaire_source is None: return False # Empty String if isinstance(questionnaire_source, str) and not questionnaire_source: return False # Empty Dataframe if isinstance(questionnaire_source, pd.DataFrame): if questionnaire_source.empty: warnings.warn( "The provided Dataframe is empty! No questions are created.", stacklevel=2 ) return False # Optional check if the correct columns are provided? # Would probably be nice to have that warning here. return True
[docs] def duplicate(self): """ Create a deep copy of the current interview instance. Returns: LLMQuestionnaire: A deep copy of the current object. """ return copy.deepcopy(self)
[docs] def set_base_model_prompt_template( self, template: BaseModelPromptTemplate | None = None, user_prefix: str | None = "User:", assistant_prefix: str | None = "Assistant:", separator: str = "\n", system_prefix: str | None = None, ) -> Self: """Set the template used when rendering prompts for base-model completion mode. Args: template (BaseModelPromptTemplate | None): Existing template object to store. user_prefix (str | None): Prefix placed before each user turn. assistant_prefix (str | None): Prefix placed before assistant turns and final cue. separator (str): Text inserted between rendered conversation blocks. system_prefix (str | None): Optional prefix placed before the system prompt. Returns: LLMPrompt: The current prompt object for fluent configuration. """ if template is not None: self.base_model_prompt_template = template else: self.base_model_prompt_template = BaseModelPromptTemplate( user_prefix=user_prefix, assistant_prefix=assistant_prefix, separator=separator, system_prefix=system_prefix, ) return self
[docs] def render_base_model_prompt( self, system_message: str | None, prompts: list[str], assistant_messages: list[str] | None = None, ) -> str: """Render chat-style turns into the exact prompt used for base-model generation. Args: system_message (str | None): Optional system text to place before the turns. prompts (list[str]): User turns to render. assistant_messages (list[str] | None): Assistant history between user turns. Returns: str: Rendered base-model prompt. """ messages = [] if system_message is not None: messages.append({"role": "system", "content": system_message}) assistant_messages = assistant_messages or [] for index, prompt in enumerate(prompts): messages.append({"role": "user", "content": prompt}) if index < len(assistant_messages): messages.append({"role": "assistant", "content": assistant_messages[index]}) return messages_to_base_model_prompt(messages, self.base_model_prompt_template)
[docs] def get_prompt_for_questionnaire_type( self, questionnaire_type: QuestionnairePresentation = QuestionnairePresentation.SINGLE_ITEM, item_id: str | int | None = None, item_position: int | None = 0, item_separator: str = "\n", inference_type: Literal["chat", "generation"] = "chat", ) -> tuple[str | None, str]: """ Generate the full prompt for a given questionnaire presentation. Args: quesitonnaire_type (QuestionnairePresentation): The type of questionnaire prompt to generate. item_id (str): The id of the questionnaire_item that should be shown. If both item_id and item_position are provided, only item_id is considered. item_position (int): The question at that position will be shown. If both item_id and item_position are provided, only item_id is considered. Defaults to the first question. item_separator (str): For QuestionnairePresentation.BATTERY decides the str that seperates each question. inference_type (str): If "chat", return system and user messages. If "generation", return the exact rendered base-model prompt. Returns: Tuple(str | None, str): The first element corresponds to the system_prompt, the second element to the prompt. 
""" options = "" automatic_output_instructions = "" question_map = {question.item_id: question for question in self._questions} reference_item_position = item_position if item_id: question_item = question_map[item_id] reference_item_position = next( i for i, question in enumerate(self._questions) if question.item_id == item_id ) elif item_id and item_id not in question_map.keys(): raise ValueError("item_id does not exist.") elif item_position >= len(self._questions): raise ValueError("item_order_id is bigger than the number of questions") else: question_item = self._questions[item_position] if ( questionnaire_type == QuestionnairePresentation.SINGLE_ITEM or questionnaire_type == QuestionnairePresentation.SEQUENTIAL ): question = self.generate_question_prompt(question_item) if question_item.answer_options: options = question_item.answer_options.create_options_str() rgm = question_item.answer_options.response_generation_method if rgm is None: # by default, no response generation method is required automatic_output_instructions = "" else: automatic_output_instructions: str = rgm.get_automatic_prompt() else: options = "" automatic_output_instructions = "" format_dict = { placeholder.PROMPT_QUESTIONS: question, placeholder.PROMPT_OPTIONS: options, placeholder.PROMPT_AUTOMATIC_OUTPUT_INSTRUCTIONS: automatic_output_instructions, } elif questionnaire_type == QuestionnairePresentation.BATTERY: all_questions: list[str] = [] for question in self._questions: current_question_prompt = self.generate_question_prompt(question) if question.answer_options: options = question.answer_options.create_options_str() else: options = "" format_dict = { placeholder.PROMPT_OPTIONS: options, } current_question_prompt = safe_format_with_regex( current_question_prompt, format_dict ) all_questions.append(current_question_prompt) all_questions_str = item_separator.join(all_questions) if question_item.answer_options: options = question_item.answer_options.create_options_str() else: options = "" rgm 
= resolve_battery_response_generation_method( questions=list(self._questions), item_position=reference_item_position, ) if rgm is None: # by default, no response generation method is required automatic_output_instructions = "" else: automatic_output_instructions = rgm.get_automatic_prompt() format_dict = { placeholder.PROMPT_QUESTIONS: all_questions_str, placeholder.PROMPT_OPTIONS: options, placeholder.PROMPT_AUTOMATIC_OUTPUT_INSTRUCTIONS: automatic_output_instructions, } if self.system_prompt is None: system_prompt = None else: system_prompt = safe_format_with_regex(self.system_prompt, format_dict) prompt = safe_format_with_regex(self.prompt, format_dict) if inference_type == "generation": return None, self.render_base_model_prompt(system_prompt, [prompt]) if inference_type != "chat": raise ValueError("`inference_type` must be either 'chat' or 'generation'.") return system_prompt, prompt
def _get_token_counter( self, model_id: str, tokenizer_backend: Literal["tiktoken", "transformers"], ): if tokenizer_backend == "tiktoken": import tiktoken encoding = tiktoken.encoding_for_model(model_id) def count_tokens(text: str | None) -> int: if text is None: return 0 return len(encoding.encode(text, disallowed_special=())) return count_tokens if tokenizer_backend == "transformers": try: from transformers import AutoTokenizer except ImportError as exc: raise ImportError( "Token estimation with tokenizer_backend='transformers' requires " "the optional 'transformers' package." ) from exc tokenizer = AutoTokenizer.from_pretrained(model_id) def count_tokens(text: str | None) -> int: if text is None: return 0 return len(tokenizer.encode(text, add_special_tokens=False)) return count_tokens raise ValueError("`tokenizer_backend` must be either 'tiktoken' or 'transformers'.") @staticmethod def _count_chat_input_tokens( system_prompt: str | None, prompt: str, count_tokens, tokenizer_backend: Literal["tiktoken", "transformers"], ) -> int: """Count chat message content with a small OpenAI chat wrapper estimate.""" message_count = 1 + (1 if system_prompt is not None else 0) content_tokens = count_tokens(system_prompt) + count_tokens(prompt) if tokenizer_backend == "tiktoken": # OpenAI chat APIs add structural tokens around each message plus a reply cue. return content_tokens + message_count * 3 + 3 return content_tokens
[docs] def calculate_input_token_estimate( self, model_id: str, tokenizer_backend: Literal["tiktoken", "transformers"], questionnaire_type: QuestionnairePresentation = QuestionnairePresentation.SINGLE_ITEM, inference_type: Literal["chat", "generation"] = "chat", item_separator: str = "\n", previous_response_token_estimate: int = 100, ) -> int: """Estimate the largest input-token context for a questionnaire prompt. Args: model_id (str): Model identifier for the selected tokenizer backend. tokenizer_backend (str): Tokenizer backend, either "tiktoken" or "transformers". questionnaire_type (QuestionnairePresentation): Type of questionnaire prompt. inference_type (str): If "chat", count chat message inputs. If "generation", count the rendered base-model prompt. item_separator (str): Separator used between items for battery prompts. previous_response_token_estimate (int): Estimated tokens per previous assistant answer in sequential presentation. Returns: int: Estimated largest input-token context for a single model request. 
""" if inference_type not in {"chat", "generation"}: raise ValueError("`inference_type` must be either 'chat' or 'generation'.") if previous_response_token_estimate < 0: raise ValueError("`previous_response_token_estimate` must be non-negative.") count_tokens = self._get_token_counter(model_id, tokenizer_backend) def count_prompt(system_prompt: str | None, prompt: str) -> int: if inference_type == "generation": return count_tokens(prompt) return self._count_chat_input_tokens( system_prompt, prompt, count_tokens, tokenizer_backend, ) if questionnaire_type == QuestionnairePresentation.SINGLE_ITEM: return max( count_prompt( *self.get_prompt_for_questionnaire_type( questionnaire_type=QuestionnairePresentation.SINGLE_ITEM, item_position=item_position, inference_type=inference_type, ) ) for item_position in range(len(self._questions)) ) if questionnaire_type == QuestionnairePresentation.BATTERY: return count_prompt( *self.get_prompt_for_questionnaire_type( questionnaire_type=QuestionnairePresentation.BATTERY, item_position=0, item_separator=item_separator, inference_type=inference_type, ) ) if questionnaire_type == QuestionnairePresentation.SEQUENTIAL: prompts: list[str] = [] answer_count = max(len(self._questions) - 1, 0) system_prompt: str | None = None for item_position in range(len(self._questions)): current_system_prompt, current_prompt = self.get_prompt_for_questionnaire_type( questionnaire_type=QuestionnairePresentation.SEQUENTIAL, item_position=item_position, ) system_prompt = current_system_prompt prompts.append(current_prompt) if inference_type == "generation": rendered_prompt = self.render_base_model_prompt(system_prompt, prompts) return count_tokens(rendered_prompt) + ( answer_count * previous_response_token_estimate ) content_tokens = count_tokens(system_prompt) + sum( count_tokens(prompt) for prompt in prompts ) previous_answer_tokens = answer_count * previous_response_token_estimate if tokenizer_backend == "tiktoken": message_count = ( len(prompts) + 
answer_count + (1 if system_prompt is not None else 0) ) return content_tokens + previous_answer_tokens + message_count * 3 + 3 return content_tokens + previous_answer_tokens raise ValueError(f"Unsupported questionnaire_type: {questionnaire_type}.")
[docs] def get_questions(self) -> tuple[QuestionnaireItem, ...]: """ Get an immutable snapshot of loaded interview questions. Returns: Tuple[QuestionnaireItem, ...]: Loaded questions. """ return tuple(self._questions)
@property def questions(self) -> tuple[QuestionnaireItem, ...]: """Read-only view of questionnaire items.""" return tuple(self._questions)
[docs] def get_question(self, position: int) -> QuestionnaireItem: """Return a question by positional index.""" return self._questions[position]
[docs] def replace_question(self, position: int, questionnaire_item: QuestionnaireItem) -> None: """Replace the question at a given index.""" self._questions[position] = questionnaire_item
[docs] def remove_question(self, position: int) -> None: """Remove the question at a given index.""" del self._questions[position]
[docs] def get_question_item_id(self, position: int) -> Any: """Return the questionnaire item id at a given index.""" return self._questions[position].item_id
[docs] def load_questionnaire_format(self, questionnaire_source: str | pd.DataFrame) -> Self: """Load questionnaire items from a CSV file or pandas DataFrame. The source must include `questionnaire_item_id`. It may also include question text, stems, prefilled responses, answer option columns, Likert generation columns, and simple response-generation presets. List-like columns must contain Python lists or Python-list strings, for example `["No", "Yes"]`. Args: questionnaire_source (str or pd.Dataframe): Path to a CSV file or a DataFrame. Returns: Self: The updated instance with loaded questions. """ questionnaire_questions: list[QuestionnaireItem] = [] # This is a duplicate check with actual Error here, # because if the method is called on its own it should not run the remaining code if not self._check_valid_questionnaire(questionnaire_source=questionnaire_source): raise ValueError("Please provide a non empty DataFrame or a valid String.") if isinstance(questionnaire_source, pd.DataFrame): df = questionnaire_source else: df = pd.read_csv(questionnaire_source) for _, row in df.iterrows(): questionnaire_item_id = row[constants.QUESTIONNAIRE_ITEM_ID] questionnaire_question_content = optional_row_value( row, constants.QUESTION_CONTENT, ) question_stem = optional_row_value(row, constants.QUESTION_STEM) prefilled_response = optional_row_value( row, QuestionnaireLoaderColumn.PREFILLED_RESPONSE, ) answer_options = _build_answer_options_from_row(row, questionnaire_item_id) generated_questionnaire_question = QuestionnaireItem( item_id=questionnaire_item_id, question_content=questionnaire_question_content, question_stem=question_stem, answer_options=answer_options, prefilled_response=prefilled_response, ) questionnaire_questions.append(generated_questionnaire_question) self._questions = questionnaire_questions return self
# TODO Item order could be given by ids @overload def prepare_prompt( self, question_stem: str | None = None, answer_options: AnswerOptions | None = None, prefilled_responses: dict[int, str] | None = None, randomized_item_order: bool = False, ) -> Self: ... @overload def prepare_prompt( self, question_stem: list[str] | None = None, answer_options: dict[str, AnswerOptions] | None = None, prefilled_responses: dict[int, str] | None = None, randomized_item_order: bool = False, ) -> Self: ...
[docs] def prepare_prompt( self, question_stem: str | list[str] | None = None, answer_options: AnswerOptions | dict[str, AnswerOptions] | None = None, prefilled_responses: dict[int, str] | None = None, randomized_item_order: bool = False, ) -> Self: """ Prepare the interview by assigning question stems, answer options, and prefilled responses. Args: question_stem (str or List[str], optional): Single or list of question stems. answer_options (AnswerOptions or Dict[int, AnswerOptions], optional): Answer options for all or per question. prefilled_responses (Dict[int, str], optional): If you provide prefilled responses, they will be used to fill the answers instead of prompting the LLM for that question. randomized_item_order (bool): If True, randomize the order of questions. Returns: Self: The updated instance with prepared questions. """ questionnaire_questions: list[QuestionnaireItem] = self._questions prompt_list = isinstance(question_stem, list) if prompt_list: assert len(question_stem) == len( questionnaire_questions ), "If a list of question stems is given, length of prompt " " and survey questions have to be the same" options_dict = False if isinstance(answer_options, AnswerOptions): # self._same_options = True # unnecessary options_dict = False elif isinstance(answer_options, dict): # self._same_options = False # unnecessary options_dict = True updated_questions: list[QuestionnaireItem] = [] if not prefilled_responses: prefilled_responses = {} # for survey_question in survey_questions: # prefilled_answers[survey_question.question_id] = None if not prompt_list and not options_dict: updated_questions = [] for question in questionnaire_questions: new_questionnaire_question = replace( question, question_stem=(question_stem if question_stem else question.question_stem), answer_options=answer_options, prefilled_response=prefilled_responses.get(question.item_id), ) updated_questions.append(new_questionnaire_question) elif not prompt_list and options_dict: for 
question in questionnaire_questions: new_questionnaire_question = replace( question, question_stem=(question_stem if question_stem else question.question_stem), answer_options=answer_options.get(question.item_id), prefilled_response=prefilled_responses.get(question.item_id), ) updated_questions.append(new_questionnaire_question) elif prompt_list and not options_dict: for i, question in enumerate(questionnaire_questions): new_questionnaire_question = replace( question, question_stem=(question_stem[i] if question_stem else question.question_stem), answer_options=answer_options, prefilled_response=prefilled_responses.get(question.item_id), ) updated_questions.append(new_questionnaire_question) elif prompt_list and options_dict: for i, question in enumerate(questionnaire_questions): new_questionnaire_question = replace( question, question_stem=(question_stem[i] if question_stem else question.question_stem), answer_options=answer_options.get(question.item_id), prefilled_response=prefilled_responses.get(question.item_id), ) updated_questions.append(new_questionnaire_question) if randomized_item_order: random.shuffle(updated_questions) self._questions = updated_questions return self
[docs] def generate_question_prompt(self, questionnaire_items: QuestionnaireItem) -> str: """ Generate the prompt string for a single interview question. Args: questionnaire_items (InterviewItem): The question to prompt. Returns: str: The formatted prompt for the question. """ if questionnaire_items.question_stem: if placeholder.QUESTION_CONTENT in questionnaire_items.question_stem: format_dict = {placeholder.QUESTION_CONTENT: questionnaire_items.question_content} question_prompt = safe_format_with_regex( questionnaire_items.question_stem, format_dict ) else: question_prompt = f"""{questionnaire_items.question_stem} {questionnaire_items.question_content}""" # noqa: E501 else: question_prompt = f"""{questionnaire_items.question_content}""" if questionnaire_items.answer_options: _options_str = questionnaire_items.answer_options.create_options_str() if _options_str is not None: safe_formatter = {placeholder.PROMPT_OPTIONS: _options_str} question_prompt = safe_format_with_regex(question_prompt, safe_formatter) return question_prompt
def __len__(self) -> int:
    """
    Report how many questions this prompt currently holds.

    Returns:
        int: The number of questions.
    """
    question_count = len(self._questions)
    return question_count

def __str__(self) -> str:
    """
    Render a human-readable overview: the questionnaire name, the system
    prompt and the user prompt, using the Battery presentation.
    """
    header = f"=== {self.questionnaire_name} ==="
    system_prompt, user_prompt = self.get_prompt_for_questionnaire_type(
        questionnaire_type=QuestionnairePresentation.BATTERY
    )
    sections = (
        header,
        f"=== SYSTEM_PROMPT ===\n{system_prompt}",
        f"=== USER_PROMPT_WITH_ALL_QUESTIONS ===\n{user_prompt}",
    )
    return "\n".join(sections)
def insert_questions(
    self,
    items: QuestionnaireItem | list[QuestionnaireItem],
    position: int | None = None,
) -> None:
    """Insert one or more questions into the questionnaire.

    The internal question list is modified in place.

    Args:
        items (QuestionnaireItem | list[QuestionnaireItem]): A single item, or
            a list (or tuple) of items, to insert.
        position (int | None): Index before which the items are inserted.
            ``None`` (the default) adds them at the end.
    """
    if position is None:
        position = len(self._questions)

    # A lone item is wrapped so the slice assignment always receives a sequence.
    new_items = items if isinstance(items, (list, tuple)) else [items]

    # Assigning to an empty slice splices the items in without replacing anything.
    self._questions[position:position] = new_items
# Allowed values for the ``idx_type`` argument of ``generate_likert_options``:
# lowercase letters, uppercase letters, integer codes, or no index at all.
_IDX_TYPES = Literal["char_lower", "char_upper", "integer", "no_index"]
def generate_likert_options(
    n: int,
    answer_texts: list[str] | None,
    only_from_to_scale: bool = False,
    random_order: bool = False,
    reversed_order: bool = False,
    even_order: bool = False,
    add_middle_category: bool = False,
    str_middle_cat: str = "Neutral",
    add_refusal: bool = False,
    refusal_code: str = "-99",
    start_idx: int = 1,
    list_prompt_template: str = prompt_templates.LIST_OPTIONS_DEFAULT,
    scale_prompt_template: str = prompt_templates.SCALE_OPTIONS_DEFAULT,
    index_answer_separator: str = ": ",
    options_separator: str = ", ",
    idx_type: _IDX_TYPES = "integer",
    response_generation_method: ResponseGenerationMethod | None = None,
) -> AnswerOptions:
    """Generate the answer options for a Likert-style scale.

    Builds a list of option labels and matching indices of size ``n`` and wraps
    them in an ``AnswerOptions`` object. The label list passed in by the caller
    is never modified; all transformations are applied to a private copy.

    The modifying flags are applied in this fixed order: ``even_order``,
    ``add_middle_category``, ``random_order``, ``reversed_order``,
    ``add_refusal`` (so the refusal option always comes last). All of them
    operate on the label list, so ``answer_texts`` must be provided when any
    of them is set.

    Args:
        n (int): The number of options to generate (e.g., 5 for a 5-point scale).
        answer_texts (list[str] | None): Text labels for the options. Unless
            ``only_from_to_scale`` is set, a non-empty list must have length ``n``.
        only_from_to_scale (bool, optional): Request a from-to scale. Requires
            ``idx_type="integer"``; the number of labels is not checked against
            ``n`` in this mode. Defaults to False.
        random_order (bool, optional): If True, the labels are shuffled.
            Defaults to False.
        reversed_order (bool, optional): If True, the labels are reversed.
            Defaults to False.
        even_order (bool, optional): If True, the middle label of an odd-sized
            scale is removed and ``n`` is reduced by one; indices are then
            generated contiguously for the remaining options. Defaults to False.
        add_middle_category (bool, optional): If True, ``str_middle_cat`` is
            inserted in the middle of an even-sized scale and ``n`` is increased
            by one. Defaults to False.
        str_middle_cat (str, optional): Label of the inserted middle category.
            Defaults to "Neutral".
        add_refusal (bool, optional): If True, a final
            "Don't know / Refuse to answer" option is appended. Defaults to False.
        refusal_code (str, optional): Index given to the refusal option when
            ``idx_type="integer"``. With letter indices the refusal option simply
            receives the next letter. Defaults to "-99".
        start_idx (int, optional): Offset of the first index. For integer
            indices the first option is ``str(start_idx)``. For letter indices
            the letter at position ``start_idx`` of the alphabet is used
            (0 -> "a"/"A"), so the default of 1 starts at "b"/"B"; positions
            wrap around after 26 letters. Defaults to 1.
        list_prompt_template (str, optional): Template forwarded to
            ``AnswerOptions`` for prompts that list all options.
        scale_prompt_template (str, optional): Template forwarded to
            ``AnswerOptions`` for prompts that only show the range.
        index_answer_separator (str, optional): Separator between an index and
            its label (e.g., "1: Strongly Agree"). Defaults to ": ".
        options_separator (str, optional): Separator between listed options.
            Defaults to ", ".
        idx_type (_IDX_TYPES, optional): Kind of index: "integer",
            "char_upper" (A, B, C), "char_lower" (a, b, c) or "no_index"
            (no indices at all). Defaults to "integer".
        response_generation_method (ResponseGenerationMethod | None, optional):
            Forwarded unchanged to ``AnswerOptions``. Defaults to None.

    Raises:
        ValueError: If ``only_from_to_scale`` is set and ``idx_type`` is not
            "integer"; if a non-empty ``answer_texts`` does not have length
            ``n`` (outside from-to mode); if ``even_order`` is requested for an
            even ``n`` or ``add_middle_category`` for an odd ``n``; or if
            ``random_order``/``reversed_order`` is requested with fewer than
            two labels.

    Returns:
        AnswerOptions: The generated options, wrapping an ``AnswerTexts`` object
        with the final labels and indices.

    Example:
        .. code-block:: python

            # Generate a classic 5-point "Strongly Disagree" to "Strongly Agree" scale
            labels = [
                "Strongly Disagree", "Disagree", "Neutral", "Agree", "Strongly Agree"
            ]
            options = generate_likert_options(n=5, answer_texts=labels)
    """
    # Work on a private copy: the steps below shuffle and append in place and
    # must not alter the list object owned by the caller.
    if answer_texts is not None:
        answer_texts = list(answer_texts)

    if only_from_to_scale:
        # NOTE: the number of labels is not validated for from-to scales.
        if idx_type != "integer":
            raise ValueError(
                "From-To scales require an integer scale index, but "
                f"idx_type was set to '{idx_type}'."
            )
    elif answer_texts and len(answer_texts) != n:
        raise ValueError(
            "answer_texts and n need to be the same length, but "
            f"answer_texts has length {len(answer_texts)} "
            f"and n was given as {n}."
        )

    if even_order:
        if n % 2 == 0:
            raise ValueError("If you want to turn a scale even, it should be odd before.")
        middle_index = n // 2
        answer_texts = answer_texts[:middle_index] + answer_texts[middle_index + 1 :]
        n -= 1

    if add_middle_category:
        if n % 2 != 0:
            raise ValueError("If you want to add a middle category, it should be even before.")
        middle_index = n // 2
        answer_texts = answer_texts[:middle_index] + [str_middle_cat] + answer_texts[middle_index:]
        n += 1

    if random_order:
        if len(answer_texts) < 2:
            raise ValueError("There must be at least two answer options to reorder randomly.")
        random.shuffle(answer_texts)  # in place, on the private copy

    if reversed_order:
        if len(answer_texts) < 2:
            raise ValueError("There must be at least two answer options to reorder in reverse.")
        answer_texts = answer_texts[::-1]

    if add_refusal:
        answer_texts.append("Don't know / Refuse to answer")
        n += 1

    # TODO @Jens add these to constants.py
    answer_option_indices: list[str] | None
    if idx_type == "no_index":
        # No index, just the answer options directly.
        answer_option_indices = None
    elif idx_type == "integer":
        # The refusal option (always last) gets its own code instead of a number.
        numbered = n - 1 if add_refusal else n
        answer_option_indices = [str(i + start_idx) for i in range(numbered)]
        if add_refusal:
            answer_option_indices.append(refusal_code)
    elif idx_type == "char_lower":
        answer_option_indices = [ascii_lowercase[(i + start_idx) % 26] for i in range(n)]
    elif idx_type == "char_upper":
        answer_option_indices = [ascii_uppercase[(i + start_idx) % 26] for i in range(n)]
    else:
        # Unrecognised idx_type values yield an empty index list, as before.
        answer_option_indices = []

    answer_texts_object = AnswerTexts(
        answer_texts=answer_texts,
        indices=answer_option_indices,
        index_answer_seperator=index_answer_separator,
        option_seperators=options_separator,
        only_scale=only_from_to_scale,
    )
    return AnswerOptions(
        answer_texts=answer_texts_object,
        from_to_scale=only_from_to_scale,
        list_prompt_template=list_prompt_template,
        scale_prompt_template=scale_prompt_template,
        response_generation_method=response_generation_method,
    )