Inference & Guided Decoding#
Dynamic Pydantic#
- qstn.inference.dynamic_pydantic.build_pydantic_model_from_json_object(json_object, model_name='StructuredOutput')[source]#
- Parameters:
json_object (JSONObject)
model_name (str)
- Return type:
type[BaseModel]
Response Generation Methods#
- class qstn.inference.response_generation.ChoiceResponseGenerationMethod(allowed_choices=None, allowed_choices_template=None, output_template='You only respond with the most probable answer option.', output_index_only=False)[source]#
Bases:
ResponseGenerationMethod
Base class for constraining the model output using a choice between answer options.
- Parameters:
allowed_choices (list[str] | None)
allowed_choices_template (str | None)
output_template (str)
output_index_only (bool)
- allowed_choices#
List of allowed choices for choice output
- system_prompt_template#
Template used for formatting the system prompt, e.g., from ..utilities.prompt_templates
- output_index_only#
If True, constrain output to answer option index rather than the full text of each answer option
- get_automatic_prompt(questions=())[source]#
- Parameters:
self (Self)
questions (list[QuestionnaireItem])
- class qstn.inference.response_generation.Constraints(enum: 'list[str] | None' = None, ge: 'float | None' = None, le: 'float | None' = None, min_length: 'int | None' = None, max_length: 'int | None' = None, pattern: 'str | None' = None, nullable: 'bool' = False)[source]#
Bases:
object
- Parameters:
enum (list[str] | None)
ge (float | None)
le (float | None)
min_length (int | None)
max_length (int | None)
pattern (str | None)
nullable (bool)
- enum: list[str] | None = None#
- ge: float | None = None#
- le: float | None = None#
- max_length: int | None = None#
- min_length: int | None = None#
- nullable: bool = False#
- pattern: str | None = None#
- class qstn.inference.response_generation.JSONItem(json_field: 'str', value_type: 'ScalarType' = 'string', explanation: 'str | None' = None, constraints: 'Constraints' = <factory>)[source]#
Bases:
object
- Parameters:
json_field (str)
value_type (Literal['string', 'float', 'int', 'bool'])
explanation (str | None)
constraints (Constraints)
- constraints: Constraints#
- copy_with_formatted_strings(prompt_formatter=None, **kwargs)[source]#
- Parameters:
prompt_formatter (dict[str, str] | None)
kwargs (Any)
- Return type:
- explanation: str | None = None#
- json_field: str#
- value_type: Literal['string', 'float', 'int', 'bool'] = 'string'#
- class qstn.inference.response_generation.JSONObject(json_field: 'str | None' = None, explanation: 'str | None' = None, children: 'list[JSONItem | JSONObject]' = <factory>)[source]#
Bases:
object
- Parameters:
json_field (str | None)
explanation (str | None)
children (list[JSONItem | JSONObject])
- children: list[JSONItem | JSONObject]#
- copy_with_formatted_strings(prompt_formatter=None, **kwargs)[source]#
- Parameters:
prompt_formatter (dict[str, str] | None)
kwargs (Any)
- Return type:
- explanation: str | None = None#
- json_field: str | None = None#
- class qstn.inference.response_generation.JSONReasoningResponseGenerationMethod(output_template='You always reason about the possible answer options first.\nYou respond with your reasoning and the most probable answer option in the following JSON format:\n{{JSON_TEMPLATE}}', output_index_only=False, reasoning_field='reasoning', reasoning_explanation='your reasoning about the answer options', answer_field='answer', answer_explanation='choose one of: {options}', battery_question_key_template='{{QUESTION_CONTENT_PLACEHOLDER}}', constrain_answer_options=True)[source]#
Bases:
JSONResponseGenerationMethod
Response generation method: structured outputs with reasoning.
- Parameters:
output_template (str)
output_index_only (bool)
reasoning_field (str)
reasoning_explanation (str)
answer_field (str)
answer_explanation (str)
battery_question_key_template (str)
constrain_answer_options (bool)
- class qstn.inference.response_generation.JSONResponseGenerationMethod(json_object, output_template='You only respond in the following JSON format:\n{{JSON_TEMPLATE}}', output_index_only=False, battery_question_key_template='{{QUESTION_CONTENT_PLACEHOLDER}}', constrain_answer_options=True, response_field=None)[source]#
Bases:
ResponseGenerationMethod
Base class for constraining the model output using a JSON object tree.
- Parameters:
json_object (JSONObject)
output_template (str)
output_index_only (bool)
battery_question_key_template (str)
constrain_answer_options (bool)
response_field (str | None)
- get_automatic_prompt(questions=())[source]#
- Parameters:
self (Self)
questions (list[QuestionnaireItem])
- get_json_prompt(questions=())[source]#
- Parameters:
self (Self)
questions (list[QuestionnaireItem])
- render_battery_question_key(question)[source]#
- Parameters:
question (QuestionnaireItem)
- Return type:
str
- class qstn.inference.response_generation.JSONSingleResponseGenerationMethod(output_template='You only respond with the most probable answer option in the following JSON format:\n{{JSON_TEMPLATE}}', output_index_only=False, answer_field='answer', answer_explanation='choose one of: {options}', battery_question_key_template='{{QUESTION_CONTENT_PLACEHOLDER}}', constrain_answer_options=True)[source]#
Bases:
JSONResponseGenerationMethod
Response generation method: structured outputs.
- Parameters:
output_index_only (bool)
answer_field (str)
answer_explanation (str)
battery_question_key_template (str)
constrain_answer_options (bool)
- class qstn.inference.response_generation.JSONVerbalizedDistribution(output_template='You only respond with a probability for each answer option in the following JSON format:\n{{JSON_TEMPLATE}}', output_index_only=False, option_field_template='{option}', option_explanation_template='probability for: {option}', explanation_prompt_placeholders_first_option_only=True, battery_question_key_template='{{QUESTION_CONTENT_PLACEHOLDER}}')[source]#
Bases:
JSONResponseGenerationMethod
Response generation method for option-wise probability distributions.
- Parameters:
output_template (str)
output_index_only (bool)
option_field_template (str)
option_explanation_template (str)
explanation_prompt_placeholders_first_option_only (bool)
battery_question_key_template (str)
- set_verbalized_options(options, prompt_formatter=None)[source]#
Materialize one float field per answer option.
- Parameters:
options (list[str])
prompt_formatter (dict[str, str] | None)
- Return type:
None
- verbalized_options: list[str]#
- class qstn.inference.response_generation.LogprobResponseGenerationMethod(token_position=0, token_limit=1, top_logprobs=20, allowed_choices=None, allowed_choices_template=None, ignore_reasoning=True, output_template='You only respond with the most probable answer option.', output_index_only=False)[source]#
Bases:
ResponseGenerationMethod
Base class for constraining the model output by requesting token probabilities.
- Parameters:
token_position (int)
token_limit (int)
top_logprobs (int)
allowed_choices (list[str] | None)
allowed_choices_template (str | None)
ignore_reasoning (bool)
output_template (str)
output_index_only (bool)
- token_position#
Position in output where logprobs are captured; use 0 for first-token probabilities (default)
- token_limit#
Number of output tokens to generate; e.g., use 1 for first-token probabilities (default)
- top_logprobs#
How many of the top logprobs to consider; OpenAI supports at most 20.
- allowed_choices#
If not None, restrict output additionally with guided_choice
- ignore_reasoning#
If True, only consider tokens after the reasoning output, i.e., after </think>
- system_prompt_template#
Template used for formatting the system prompt, e.g., from ..utilities.prompt_templates
- output_index_only#
If True, constrain output to answer option index rather than the full text of each answer option
- get_automatic_prompt(questions=())[source]#
- Parameters:
self (Self)
questions (list[QuestionnaireItem])
- class qstn.inference.response_generation.ResponseGenerationMethod[source]#
Bases:
ABC
Abstract base class for constraining model output for closed-ended questions.
- abstractmethod get_automatic_prompt(questions=())[source]#
- Parameters:
self (Self)
questions (list[QuestionnaireItem])
- qstn.inference.response_generation.constrain_json_response_options(json_object, response_field, options)[source]#
Return a JSON object copy with answer-option enum constraints applied.
- Parameters:
json_object (JSONObject)
response_field (str | None)
options (list[str])
- Return type:
JSONObject
- qstn.inference.response_generation.copy_json_response_generation_method(response_generation_method, json_object=None, prompt_formatter=None, **format_kwargs)[source]#
- Parameters:
response_generation_method (JSONResponseGenerationMethod)
json_object (JSONObject | None)
prompt_formatter (dict[str, str] | None)
format_kwargs (Any)
- Return type:
- qstn.inference.response_generation.resolve_battery_response_generation_method(questions, item_position=0)[source]#
Resolve the response-generation method to use for battery prompts.
- Parameters:
questions (list[QuestionnaireItem])
item_position (int)
- Return type:
ResponseGenerationMethod | None
Inference#
- qstn.inference.survey_inference.batch_generation(model, system_messages=('You are a helpful assistant.',), prompts=('Hi there! What is your name?',), response_generation_method=None, seed=42, client_model_name=None, api_concurrency=10, print_conversation=False, number_of_printed_conversations=2, print_progress=True, reasoning_start_token='<think>', reasoning_end_token='</think>', space_char='Ġ', inference_mode='chat', **generation_kwargs)[source]#
Generate responses for a batch of prompts.
Handles both vLLM and OpenAI API generation, with support for structured output (JSON or choice format), conversation printing, progress tracking, and concurrent API requests.
- Parameters:
model (LLM or AsyncOpenAI) – vLLM model or AsyncOpenAI client.
system_messages (List(str)) – System prompts for each conversation.
prompts (List(str)) – User prompts to generate responses for.
response_generation_method (ResponseGenerationMethod or List(ResponseGenerationMethod), optional) – Configuration for structured output.
seed (int) – Random seed for reproducibility, defaults to 42.
client_model_name (str, optional) – Model name when using OpenAI API.
api_concurrency (int) – Max concurrent API requests when using OpenAI API.
number_of_printed_conversations (int) – How many conversations should be printed. Defaults to 2.
print_conversation (bool) – If True, prints conversations. Defaults to False.
print_progress (bool) – If True, shows progress bar. Defaults to True.
reasoning_start_token (str) – Special token at the beginning of reasoning models’ output. Used for manual parsing if automatic parsing fails.
reasoning_end_token (str) – Special token to separate reasoning from regular model output. Used for manual parsing if automatic parsing fails.
space_char (str) – Special char to encode spaces in tokens (“Ġ” for most byte-pair tokenizers).
inference_mode (str) – Use “chat” for message-based models or “completion” for base-model text generation. Defaults to “chat”.
generation_kwargs (Any) – Additional generation parameters
response_generation_method (ResponseGenerationMethod | list[ResponseGenerationMethod] | None)
number_of_printed_conversations (int)
space_char (str)
- Returns:
Generated Response, Logprobs, Reasoning
- Return type:
Tuple[List[str], List[str], List[str]]
- qstn.inference.survey_inference.batch_turn_by_turn_generation(model, system_messages=('You are a helpful assistant.',), prompts=(('Hi there! What is your name?', 'Interesting'),), assistant_messages=None, response_generation_method=None, seed=42, client_model_name=None, api_concurrency=10, print_conversation=False, number_of_printed_conversations=2, print_progress=True, reasoning_start_token='<think>', reasoning_end_token='</think>', space_char='Ġ', inference_mode='chat', **generation_kwargs)[source]#
Generate responses for multi-turn conversations.
Handles conversations with multiple back-and-forth exchanges between user and assistant. Supports structured output formats, pre-filled assistant messages, conversation printing, and progress tracking.
- Parameters:
model (LLM or AsyncOpenAI) – vLLM model or AsyncOpenAI client.
system_messages (List(str)) – System prompts for each conversation.
prompts (List(List(str))) – User prompts to generate responses for. Can include multiple requests per system prompt.
assistant_messages (List(List(str)), optional) – Prefilled assistant responses. For example, if the first list contains one entry, the first assistant turn is prefilled and not inferred.
response_generation_method (ResponseGenerationMethod or List(ResponseGenerationMethod), optional) – Configuration for structured output.
seed (int) – Random seed for reproducibility, defaults to 42.
client_model_name (str, optional) – Model name when using OpenAI API.
api_concurrency (int) – Max concurrent API requests when using OpenAI API.
print_conversation (bool) – If True, prints conversations. Defaults to False.
number_of_printed_conversations (int) – How many conversations should be printed. Defaults to 2.
print_progress (bool) – If True, shows progress bar. Defaults to True.
reasoning_start_token (str) – Special token at the beginning of reasoning models’ output. Used for manual parsing if automatic parsing fails.
reasoning_end_token (str) – Special token to separate reasoning from regular model output. Used for manual parsing if automatic parsing fails.
space_char (str) – Special char to encode spaces in tokens (“Ġ” for most byte-pair tokenizers).
inference_mode (str) – Use “chat” for message-based models or “completion” for base-model text generation. Defaults to “chat”.
generation_kwargs – Additional generation parameters.
response_generation_method (ResponseGenerationMethod | list[ResponseGenerationMethod] | None)
space_char (str)
- Returns:
Generated Response, Logprobs, Reasoning
- Return type:
Tuple[List[str], List[str], List[str]]
- qstn.inference.local_inference.default_model_init(model_id, seed=42, **model_keywords)[source]#
Initialize a vLLM model with default settings.
- Parameters:
model_id (str) – HuggingFace model identifier
seed (int) – Random seed for reproducibility
**model_keywords – Additional keywords passed to LLM constructor
- Returns:
Initialized vLLM model instance
- Return type:
LLM
- qstn.inference.local_inference.run_vllm_batch(model, system_messages=('You are a helpful assistant.',), prompts=('Hi there! What is your name?',), response_generation_method=None, seed=42, print_progress=True, reasoning_start_token='<think>', reasoning_end_token='</think>', space_char='Ġ', inference_mode='chat', **generation_kwargs)[source]#
- Parameters:
model (vllm.LLM)
system_messages (Sequence[str | None] | None)
prompts (Sequence[str])
response_generation_method (ResponseGenerationMethod | list[ResponseGenerationMethod] | None)
seed (int)
print_progress (bool)
reasoning_start_token (str)
reasoning_end_token (str)
space_char (str)
inference_mode (Literal['chat', 'completion'])
generation_kwargs (Any)
- Return type:
tuple[list[str], list[str], list[str]]
- qstn.inference.local_inference.run_vllm_batch_conversation(model, system_messages=('You are a helpful assistant.',), prompts=(('Hi there! What is your name?',),), assistant_messages=(), response_generation_method=None, seed=42, print_progress=True, reasoning_start_token='<think>', reasoning_end_token='</think>', space_char='Ġ', inference_mode='chat', **generation_kwargs)[source]#
- Parameters:
model (vllm.LLM)
system_messages (Sequence[str | None] | None)
prompts (Sequence[Sequence[str]])
assistant_messages (Sequence[Sequence[str]])
response_generation_method (ResponseGenerationMethod | list[ResponseGenerationMethod] | None)
seed (int)
print_progress (bool)
reasoning_start_token (str)
reasoning_end_token (str)
space_char (str)
inference_mode (Literal['chat', 'completion'])
generation_kwargs (Any)
- Return type:
tuple[list[str], list[str], list[str]]
- qstn.inference.remote_inference.run_openai_batch(model, system_messages=('You are a helpful assistant.',), prompts=('Hi there! What is your name?',), response_generation_method=None, seed=42, client_model_name=None, api_concurrency=10, reasoning_start_token='<think>', reasoning_end_token='</think>', print_progress=True, inference_mode='chat', **generation_kwargs)[source]#
- Parameters:
model (AsyncOpenAI)
system_messages (list[str | None] | None)
prompts (list[str])
response_generation_method (ResponseGenerationMethod | list[ResponseGenerationMethod] | None)
seed (int)
client_model_name (str | None)
api_concurrency (int)
reasoning_start_token (str)
reasoning_end_token (str)
print_progress (bool)
inference_mode (Literal['chat', 'completion'])
generation_kwargs (Any)
- Return type:
tuple[list[str], list[str], list[str]]
- qstn.inference.remote_inference.run_openai_batch_conversation(model, system_messages=('You are a helpful assistant.',), prompts=(('Hi there! What is your name?',),), assistant_messages=None, response_generation_method=None, seed=42, client_model_name=None, api_concurrency=10, reasoning_start_token='<think>', reasoning_end_token='</think>', print_progress=True, inference_mode='chat', **generation_kwargs)[source]#
- Parameters:
model (AsyncOpenAI)
system_messages (list[str | None] | None)
prompts (list[list[str]])
assistant_messages (list[list[str]] | None)
response_generation_method (ResponseGenerationMethod | list[ResponseGenerationMethod] | None)
seed (int)
client_model_name (str | None)
api_concurrency (int)
reasoning_start_token (str)
reasoning_end_token (str)
print_progress (bool)
inference_mode (Literal['chat', 'completion'])
generation_kwargs (Any)
- Return type:
tuple[list[str], list[str], list[str]]