Inference & Guided Decoding

Inference & Guided Decoding#

Dynamic Pydantic#

qstn.inference.dynamic_pydantic.build_pydantic_model_from_json_object(json_object, model_name='StructuredOutput')[source]#

Parameters:

json_object (JSONObject)
model_name (str)

Return type:

type[BaseModel]

Response Generation Methods#

class qstn.inference.response_generation.ChoiceResponseGenerationMethod(output_template='You only respond with the most probable answer option.', output_index_only=False, constrain_answer_options=True, constrain_output=True)[source]#

Bases: ResponseGenerationMethod

Constrain model output to one of a question’s answer options.

Parameters:

output_template (str)
output_index_only (bool)
constrain_answer_options (bool)
constrain_output (bool)

get_automatic_prompt(questions=())[source]#

Parameters:

self (Self)
questions (list[QuestionnaireItem])

prepare_for_answer_options(options, options_text, prompt_formatter)[source]#

Prepare a copied method for one question’s materialized answer options.

Parameters:

options (list[str])
options_text (str)
prompt_formatter (dict[str, str])

Return type:

ChoiceResponseGenerationMethod

property resolved_choices: list[str] | None#: Choices materialized when this method is attached to AnswerOptions.

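class qstn.inference.response_generation.Constraints(enum=None, ge=None, le=None, min_length=None, max_length=None, pattern=None, nullable=False)[source]#

Bases: object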

Parameters:

enum (list[str] | None)
ge (float | None)
le (float | None)
min_length (int | None)
max_length (int | None)
pattern (str | None)
nullable (bool)

enum: list[str] | None = None#

ge: float | None = None#

le: float | None = None#

max_length: int | None = None#

min_length: int | None = None#

nullable: bool = False#

pattern: str | None = None#

class qstn.inference.response_generation.JSONItem(json_field: 'str', value_type: 'ScalarType' = 'string', explanation: 'str | None' = None, constraints: 'Constraints' = <factory>)[source]#

Bases: object

Parameters:

json_field (str)
value_type (Literal['string', 'float', 'int', 'bool'])
explanation (str | None)
constraints (Constraints)

constraints: Constraints#

copy_with_formatted_strings(prompt_formatter=None, **kwargs)[source]#

Parameters:

prompt_formatter (dict[str, str] | None)
kwargs (Any)

Return type:

JSONItem

explanation: str | None = None#

json_field: str#

to_prompt_obj()[source]#

Return type:

dict[str, Any]

to_prompt_value()[source]#

Return type:

str

value_type: Literal['string', 'float', 'int', 'bool'] = 'string'#

class qstn.inference.response_generation.JSONObject(json_field: 'str | None' = None, explanation: 'str | None' = None, children: 'list[JSONItem | JSONObject]' = <factory>)[source]#

Bases: object

Parameters:

json_field (str | None)
explanation (str | None)
children (list[JSONItem | JSONObject])

children: list[JSONItem | JSONObject]#

copy_with_formatted_strings(prompt_formatter=None, **kwargs)[source]#

Parameters:

prompt_formatter (dict[str, str] | None)
kwargs (Any)

Return type:

JSONObject

explanation: str | None = None#

json_field: str | None = None#

to_prompt_obj()[source]#

Return type:

dict[str, Any]

to_prompt_str()[source]#

Return type:

str

to_prompt_value()[source]#

Return type:

dict[str, Any]

class qstn.inference.response_generation.JSONReasoningResponseGenerationMethod(output_template='You always reason about the possible answer options first.\nYou respond with your reasoning and the most probable answer option in the following JSON format:\n{{JSON_TEMPLATE}}', output_index_only=False, reasoning_field='reasoning', reasoning_explanation='your reasoning about the answer options', answer_field='answer', answer_explanation='{options}', battery_question_key_template='{{QUESTION_CONTENT_PLACEHOLDER}}', constrain_answer_options=True, constrain_output=True)[source]#

Bases: JSONResponseGenerationMethod

Response Generation Method: Structured Outputs with Reasoning

Parameters:

output_template (str)
output_index_only (bool)
reasoning_field (str)
reasoning_explanation (str)
answer_field (str)
answer_explanation (str)
battery_question_key_template (str)
constrain_answer_options (bool)
constrain_output (bool)

class qstn.inference.response_generation.JSONResponseGenerationMethod(json_object, output_template='You only respond in the following JSON format:\n{{JSON_TEMPLATE}}', output_index_only=False, battery_question_key_template='{{QUESTION_CONTENT_PLACEHOLDER}}', constrain_answer_options=True, response_field=None, constrain_output=True)[source]#

Bases: ResponseGenerationMethod

Base class for constraining the model output using a JSON object tree.

Parameters:

json_object (JSONObject)
output_template (str)
output_index_only (bool)
battery_question_key_template (str)
constrain_answer_options (bool)
response_field (str | None)
constrain_output (bool)

get_automatic_prompt(questions=())[source]#

Parameters:

self (Self)
questions (list[QuestionnaireItem])

get_json_prompt(questions=())[source]#

Parameters:

self (Self)
questions (list[QuestionnaireItem])

prepare_for_answer_options(options, options_text, prompt_formatter)[source]#

Prepare a copied method for one question’s materialized answer options.

Parameters:

options (list[str])
options_text (str)
prompt_formatter (dict[str, str])

Return type:

JSONResponseGenerationMethod

render_battery_question_key(question)[source]#

Parameters:

question (QuestionnaireItem)

Return type:

str

class qstn.inference.response_generation.JSONSingleResponseGenerationMethod(output_template='You only respond with the most probable answer option in the following JSON format:\n{{JSON_TEMPLATE}}', output_index_only=False, answer_field='answer', answer_explanation='{options}', battery_question_key_template='{{QUESTION_CONTENT_PLACEHOLDER}}', constrain_answer_options=True, constrain_output=True)[source]#

Bases: JSONResponseGenerationMethod

Response Generation Method: Structured Outputs

Parameters:

output_template (str)
output_index_only (bool)
answer_field (str)
answer_explanation (str)
battery_question_key_template (str)
constrain_answer_options (bool)
constrain_output (bool)

class qstn.inference.response_generation.JSONVerbalizedDistribution(output_template='You only respond with a probability for each answer option in the following JSON format:\n{{JSON_TEMPLATE}}', output_index_only=False, option_field_template='{option}', option_explanation_template='probability for: {option}', explanation_prompt_placeholders_first_option_only=True, battery_question_key_template='{{QUESTION_CONTENT_PLACEHOLDER}}', constrain_output=True)[source]#

Bases: JSONResponseGenerationMethod

Response generation method for option-wise probability distributions.

Parameters:

output_template (str)
output_index_only (bool)
option_field_template (str)
option_explanation_template (str)
explanation_prompt_placeholders_first_option_only (bool)
battery_question_key_template (str)
constrain_output (bool)

prepare_for_answer_options(options, options_text, prompt_formatter)[source]#

Prepare a copied method for one question’s materialized answer options.

Parameters:

options (list[str])
options_text (str)
prompt_formatter (dict[str, str])

Return type:

JSONVerbalizedDistribution

set_verbalized_options(options, prompt_formatter=None)[source]#

Materialize one float field per answer option.

Parameters:

options (list[str])
prompt_formatter (dict[str, str] | None)

Return type:

None

verbalized_options: list[str]#

class qstn.inference.response_generation.LogprobResponseGenerationMethod(token_position=0, token_limit=1, top_logprobs=20, ignore_reasoning=True, output_template='You only respond with the most probable answer option.', output_index_only=False, constrain_answer_options=True, constrain_output=True)[source]#

Bases: ResponseGenerationMethod

Base class for constraining the model output by requesting token probabilities.

Parameters:

token_position (int)
token_limit (int)
top_logprobs (int)
ignore_reasoning (bool)
output_template (str)
output_index_only (bool)
constrain_answer_options (bool)
constrain_output (bool)

token_position#: Position in output where logprobs are captured; use 0 for first-token probabilities (default)

token_limit#: Number of output tokens to generate; e.g., use 1 for first-token probabilities (default)

top_logprobs#: How many of the logprobs to consider, OpenAI supports at most 20

constrain_answer_options#: If True, restrict output to the attached answer options

ignore_reasoning#: If True, only consider tokens after the reasoning output, i.e., after </think>

output_template#: Template used for formatting the system prompt, e.g., from ..utilities.prompt_templates

output_index_only#: If True, constrain output to answer option index rather than the full text of each answer option

get_automatic_prompt(questions=())[source]#

Parameters:

self (Self)
questions (list[QuestionnaireItem])

prepare_for_answer_options(options, options_text, prompt_formatter)[source]#

Prepare a copied method for one question’s materialized answer options.

Parameters:

options (list[str])
options_text (str)
prompt_formatter (dict[str, str])

Return type:

LogprobResponseGenerationMethod

property resolved_choices: list[str] | None#: Choices materialized when this method is attached to AnswerOptions.

class qstn.inference.response_generation.ResponseGenerationMethod(constrain_answer_options=True, constrain_output=True)[source]#

Bases: ABC

Abstract base class for constraining model output for closed-ended questions.

Parameters:

constrain_answer_options (bool)
constrain_output (bool)

abstractmethod get_automatic_prompt(questions=())[source]#

Parameters:

self (Self)
questions (list[QuestionnaireItem])

prepare_for_answer_options(options, options_text, prompt_formatter)[source]#

Prepare a copied method for one question’s materialized answer options.

Parameters:

options (list[str])
options_text (str)
prompt_formatter (dict[str, str])

Return type:

ResponseGenerationMethod

qstn.inference.response_generation.constrain_json_response_options(json_object, response_field, options)[source]#

Return a JSON object copy with answer-option enum constraints applied.

Parameters:

json_object (JSONObject)
response_field (str | None)
options (list[str])

Return type:

JSONObject

qstn.inference.response_generation.copy_json_response_generation_method(response_generation_method, json_object=None, prompt_formatter=None, **format_kwargs)[source]#

Copy a JSON response method while optionally replacing or formatting its schema.

Parameters:

response_generation_method (JSONResponseGenerationMethod)
json_object (JSONObject | None)
prompt_formatter (dict[str, str] | None)
format_kwargs (Any)

Return type:

JSONResponseGenerationMethod

qstn.inference.response_generation.get_constrained_choices(response_generation_method)[source]#

Return prepared guided choices or validate a missing questionnaire attachment.

Parameters:

response_generation_method (ChoiceResponseGenerationMethod | LogprobResponseGenerationMethod)

Return type:

list[str] | None

qstn.inference.response_generation.resolve_battery_response_generation_method(questions, item_position=0)[source]#

Resolve the response-generation method to use for battery prompts.

Parameters:

questions (list[QuestionnaireItem])
item_position (int)

Return type:

ResponseGenerationMethod | None

Inference#

qstn.inference.survey_inference.batch_generation(model, system_messages=('You are a helpful assistant.',), prompts=('Hi there! What is your name?',), response_generation_method=None, seed=42, client_model_name=None, api_concurrency=10, print_conversation=False, number_of_printed_conversations=2, print_progress=True, reasoning_start_token='<think>', reasoning_end_token='</think>', space_char='Ġ', inference_mode='chat', **generation_kwargs)[source]#

Generate responses for a batch of prompts.

Handles both vLLM and OpenAI API generation with support for:

- Structured output (JSON or choice format)
- Conversation printing
- Progress tracking
- Concurrent API requests

Parameters:

model (LLM or AsyncOpenAI) – vLLM model or AsyncOpenAI client.
system_messages (List(str)) – System prompts for each conversation.
prompts (BatchPromptContent) – User prompts. Each request may be a string or an ordered sequence of string and ImageInput blocks.
response_generation_method (ResponseGenerationMethod or List(ResponseGenerationMethod), optional) – Configuration for structured output.
seed (int) – Random seed for reproducibility, defaults to 42.
client_model_name (str, optional) – Model name when using OpenAI API.
api_concurrency (int) – Max concurrent API requests when using OpenAI API.
number_of_printed_conversations (int) – How many conversations should be printed. Defaults to 2.
print_conversation (bool) – If True, prints conversations. Defaults to False.
print_progress (bool) – If True, shows progress bar. Defaults to True.
reasoning_start_token (str) – Special token at the beginning of reasoning models’ output. Used for manual parsing if automatic parsing fails.
reasoning_end_token (str) – Special token to separate reasoning from regular model output. Used for manual parsing if automatic parsing fails.
space_char (str) – Special char to encode spaces in tokens (“Ġ” for most byte-pair tokenizers).
inference_mode (str) – Use “chat” for message-based models or “completion” for base-model text generation. Defaults to “chat”.
generation_kwargs (Any) – Additional generation parameters
response_generation_method (ResponseGenerationMethod | list[ResponseGenerationMethod] | None)
number_of_printed_conversations (int)
space_char (str)

Returns:

Generated Response, Logprobs, Reasoning

Return type:

Tuple[List[str], List[str], List[str]]

qstn.inference.survey_inference.batch_turn_by_turn_generation(model, system_messages=('You are a helpful assistant.',), prompts=(('Hi there! What is your name?', 'Interesting'),), assistant_messages=None, response_generation_method=None, seed=42, client_model_name=None, api_concurrency=10, print_conversation=False, number_of_printed_conversations=2, print_progress=True, reasoning_start_token='<think>', reasoning_end_token='</think>', space_char='Ġ', inference_mode='chat', **generation_kwargs)[source]#

Generate responses for multi-turn conversations.

Handles conversations with multiple back-and-forth exchanges between user and assistant. Supports:

- Structured output formats
- Pre-filled assistant messages
- Conversation printing
- Progress tracking

Parameters:

model (LLM or AsyncOpenAI) – vLLM model or AsyncOpenAI client.
system_messages (List(str)) – System prompts for each conversation.
prompts (ConversationPromptContent) – User prompts grouped by conversation. Each turn may be a string or an ordered sequence of string and ImageInput blocks.
assistant_messages (List(List(str)), optional) – Prefilled assistant responses. For example, if the first list contains one entry, the first assistant turn is prefilled and not inferred.
response_generation_method (ResponseGenerationMethod or List(ResponseGenerationMethod), optional) – Configuration for structured output.
seed (int) – Random seed for reproducibility, defaults to 42.
client_model_name (str, optional) – Model name when using OpenAI API.
api_concurrency (int) – Max concurrent API requests when using OpenAI API.
print_conversation (bool) – If True, prints conversations. Defaults to False.
number_of_printed_conversations (int) – How many conversations should be printed. Defaults to 2.
print_progress (bool) – If True, shows progress bar. Defaults to True.
reasoning_start_token (str) – Special token at the beginning of reasoning models’ output. Used for manual parsing if automatic parsing fails.
reasoning_end_token (str) – Special token to separate reasoning from regular model output. Used for manual parsing if automatic parsing fails.
space_char (str) – Special char to encode spaces in tokens (“Ġ” for most byte-pair tokenizers).
inference_mode (str) – Use “chat” for message-based models or “completion” for base-model text generation. Defaults to “chat”.
generation_kwargs – Additional generation parameters.
response_generation_method (ResponseGenerationMethod | list[ResponseGenerationMethod] | None)
space_char (str)

Returns:

Generated Response, Logprobs, Reasoning

Return type:

Tuple[List[str], List[str], List[str]]

qstn.inference.local_inference.default_model_init(model_id, seed=42, **model_keywords)[source]#

Initialize a vLLM model with default settings.

Parameters:

model_id (str) – HuggingFace model identifier
seed (int) – Random seed for reproducibility
**model_keywords – Additional keywords passed to LLM constructor

Returns:

Initialized vLLM model instance

Return type:

LLM

qstn.inference.local_inference.run_vllm_batch(model, system_messages=('You are a helpful assistant.',), prompts=('Hi there! What is your name?',), response_generation_method=None, seed=42, print_progress=True, reasoning_start_token='<think>', reasoning_end_token='</think>', space_char='Ġ', inference_mode='chat', **generation_kwargs)[source]#

Parameters:

model (vllm.LLM)
system_messages (Sequence[str | None] | None)
prompts (BatchPromptContent)
response_generation_method (ResponseGenerationMethod | list[ResponseGenerationMethod] | None)
seed (int)
print_progress (bool)
reasoning_start_token (str)
reasoning_end_token (str)
space_char (str)
inference_mode (Literal['chat', 'completion'])
generation_kwargs (Any)

Return type:

tuple[list[str], list[str], list[str]]

qstn.inference.local_inference.run_vllm_batch_conversation(model, system_messages=('You are a helpful assistant.',), prompts=(('Hi there! What is your name?',),), assistant_messages=(), response_generation_method=None, seed=42, print_progress=True, reasoning_start_token='<think>', reasoning_end_token='</think>', space_char='Ġ', inference_mode='chat', **generation_kwargs)[source]#

Parameters:

model (vllm.LLM)
system_messages (Sequence[str | None] | None)
prompts (ConversationPromptContent)
assistant_messages (Sequence[Sequence[str]])
response_generation_method (ResponseGenerationMethod | list[ResponseGenerationMethod] | None)
seed (int)
print_progress (bool)
reasoning_start_token (str)
reasoning_end_token (str)
space_char (str)
inference_mode (Literal['chat', 'completion'])
generation_kwargs (Any)

Return type:

tuple[list[str], list[str], list[str]]

qstn.inference.remote_inference.run_openai_batch(model, system_messages=('You are a helpful assistant.',), prompts=('Hi there! What is your name?',), response_generation_method=None, seed=42, client_model_name=None, api_concurrency=10, reasoning_start_token='<think>', reasoning_end_token='</think>', print_progress=True, inference_mode='chat', **generation_kwargs)[source]#

Parameters:

model (AsyncOpenAI)
system_messages (list[str | None] | None)
prompts (BatchPromptContent)
response_generation_method (ResponseGenerationMethod | list[ResponseGenerationMethod] | None)
seed (int)
client_model_name (str | None)
api_concurrency (int)
reasoning_start_token (str)
reasoning_end_token (str)
print_progress (bool)
inference_mode (Literal['chat', 'completion'])
generation_kwargs (Any)

Return type:

tuple[list[str], list[str], list[str]]

qstn.inference.remote_inference.run_openai_batch_conversation(model, system_messages=('You are a helpful assistant.',), prompts=(('Hi there! What is your name?',),), assistant_messages=None, response_generation_method=None, seed=42, client_model_name=None, api_concurrency=10, reasoning_start_token='<think>', reasoning_end_token='</think>', print_progress=True, inference_mode='chat', **generation_kwargs)[source]#

Parameters:

model (AsyncOpenAI)
system_messages (list[str | None] | None)
prompts (ConversationPromptContent)
assistant_messages (list[list[str]] | None)
response_generation_method (ResponseGenerationMethod | list[ResponseGenerationMethod] | None)
seed (int)
client_model_name (str | None)
api_concurrency (int)
reasoning_start_token (str)
reasoning_end_token (str)
print_progress (bool)
inference_mode (Literal['chat', 'completion'])
generation_kwargs (Any)

Return type:

tuple[list[str], list[str], list[str]]

Inference & Guided Decoding

Contents

Inference & Guided Decoding#

Dynamic Pydantic#

Response Generation Methods#

Inference#