Prompt Iteration Walkthrough
This notebook performs a quick walkthrough of LangSmith's evaluation flow, introducing:
- Datasets & Evaluation
- Summary evaluators (for aggregate statistics)
- Prompt Versioning in the hub.
- Using the LLM proxy.
Importantly, the pipelines in this walkthrough do not depend on the LangChain open source libraries. LangChain is only used in part 3, to show how to use one of its many off-the-shelf evaluators, and in the prompt-management aside, to connect to the prompt Hub.
Setup
First, we'll do some setup. Create a LangSmith API Key by navigating to the settings page in LangSmith, then set the following environment variables.
OPENAI_API_KEY=<YOUR OPENAI API KEY>
LANGCHAIN_TRACING_V2=true
LANGCHAIN_PROJECT=<YOUR PROJECT NAME>
LANGCHAIN_API_KEY=<YOUR LANGSMITH API KEY>
from langsmith import Client
client = Client()
Pt. 1 -- Toxic Queries
# Labelled (text, label) pairs that seed the "Toxic Queries" dataset.
toxic_examples = [
    ("Shut up, idiot", "Toxic"),
    ("You're a wonderful person", "Not toxic"),
    ("This is the worst thing ever", "Toxic"),
    ("I had a great day today", "Not toxic"),
    ("Nobody likes you", "Toxic"),
    ("This movie is a masterpiece", "Not toxic"),
    ("Go away and never come back", "Toxic"),
    ("Thank you for your help", "Not toxic"),
    ("This is so dumb", "Toxic"),
    ("I appreciate your efforts", "Not toxic"),
    ("This is a waste of time", "Toxic"),
    ("This movie blows", "Toxic"),
    ("This is unacceptable. I want to speak to the manager.", "Toxic"),
]

toxic_dataset_name = "Toxic Queries"

# Seed the dataset only when it does not exist yet, so re-running this cell
# does not upload the same examples twice.
if not client.has_dataset(dataset_name=toxic_dataset_name):
    toxic_dataset = client.create_dataset(dataset_name=toxic_dataset_name)
    # Parallel tuples: inputs[i] is the query, outputs[i] its reference label.
    inputs = tuple({"text": text} for text, _ in toxic_examples)
    outputs = tuple({"label": label} for _, label in toxic_examples)
    client.create_examples(
        inputs=inputs,
        outputs=outputs,
        dataset_id=toxic_dataset.id,
    )
Our Pipeline
from langsmith import traceable, wrappers
from openai import OpenAI

# Import the client as `OpenAI` rather than `Client`: `from openai import
# Client` would rebind the `Client` name already imported from langsmith
# earlier in this file. This also matches the import used in the proxy section.
# The client is passed through langsmith's wrap_openai before use.
openai = wrappers.wrap_openai(OpenAI())
@traceable
def label_query(text):
    """Ask the chat model whether ``text`` is toxic.

    Sends a fixed system instruction plus ``text`` as the user message and
    returns the content of the first completion choice unchanged.
    """
    instructions = "Given the following user query, assess whether it contains toxic content. Please provide a simple 'Toxic' or 'Not toxic' response based on your assessment."
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=[
            {"role": "system", "content": instructions},
            {"role": "user", "content": text},
        ],
    )
    return response.choices[0].message.content
@traceable
def label_query_alternate_prompt(text):
    """Classify ``text`` as toxic or not using the second prompt wording.

    Same request shape as ``label_query``; only the system instruction
    differs. Returns the content of the first completion choice unchanged.
    """
    instructions = "Please review the user query below and determine if it contains any form of toxic behavior, such as insults, threats, or highly negative comments. Respond with 'Toxic' if it does, and 'Not toxic' if it doesn't."
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=[
            {"role": "system", "content": instructions},
            {"role": "user", "content": text},
        ],
    )
    return response.choices[0].message.content
Evaluate
from langsmith.evaluation import evaluate
# Row-level evaluator
def correct_label(run, example) -> dict:
    """Score a single run by exact match against the reference label.

    Args:
        run: object whose ``outputs`` mapping holds the prediction under
            ``"output"``; ``outputs`` may be ``None``.
        example: object whose ``outputs`` mapping holds the reference under
            ``"label"``.

    Returns:
        ``{"score": 1}`` when a prediction exists and equals the label,
        otherwise ``{"score": 0}``.
    """
    predicted = (run.outputs or {}).get("output")
    expected = (example.outputs or {}).get("label")
    # A missing prediction must never count as correct, even when the label
    # is missing too (None == None would otherwise score 1).
    matched = predicted is not None and predicted == expected
    return {"score": int(matched)}
# Summary (experiment-level) evaluator
def summary_eval(runs, examples):
    """Report whether more than half of the runs predicted the right label.

    Args:
        runs: sequence of objects whose ``outputs`` mapping holds the
            prediction under ``"output"``.
        examples: sequence, parallel to ``runs``, of objects whose
            ``outputs`` mapping holds the reference under ``"label"``.

    Returns:
        ``{"key": "pass", "score": bool}``. The score is ``True`` only when
        strictly more than 50% of ``runs`` match. An empty ``runs`` yields
        ``False`` instead of dividing by zero.
    """
    if not runs:
        return {"key": "pass", "score": False}

    correct = 0
    for run, example in zip(runs, examples):
        predicted = (run.outputs or {}).get("output")
        expected = (example.outputs or {}).get("label")
        # A run with no output counts as wrong rather than raising.
        if predicted is not None and predicted == expected:
            correct += 1

    return {"key": "pass", "score": correct / len(runs) > 0.5}
# Experiment for prompt version 1: label_query is applied to the "text" field
# of each example, scored per row by correct_label and per experiment by
# summary_eval.
results_1 = evaluate(
    lambda inputs: label_query(inputs["text"]),
    data=toxic_dataset_name,
    evaluators=[correct_label],
    summary_evaluators=[summary_eval],
    experiment_prefix="Toxic Queries",
    metadata={"prompt_version": "1"},
)
/var/folders/gf/6rnp_mbx5914kx7qmmh7xzmw0000gn/T/ipykernel_37884/109358324.py:22: UserWarning: Function evaluate is in beta.
results_1 = evaluate(
View the evaluation results for experiment: 'Toxic Queries:683e040' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/dc6cc406-d05e-4e57-9c47-5868523f5a98/compare?selectedSessions=4fa33927-71dd-4edc-88bd-bac87df4d346
0it [00:00, ?it/s]
# Experiment for prompt version 2: same dataset and evaluators as the first
# experiment, but the target is label_query_alternate_prompt.
results_2 = evaluate(
    lambda inputs: label_query_alternate_prompt(inputs["text"]),
    data=toxic_dataset_name,
    evaluators=[correct_label],
    summary_evaluators=[summary_eval],
    experiment_prefix="Toxic Queries",
    metadata={"prompt_version": "2"},
)
/var/folders/gf/6rnp_mbx5914kx7qmmh7xzmw0000gn/T/ipykernel_37884/2093667517.py:1: UserWarning: Function evaluate is in beta.
results_2 = evaluate(
View the evaluation results for experiment: 'Toxic Queries:fb8f77e' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/dc6cc406-d05e-4e57-9c47-5868523f5a98/compare?selectedSessions=4588ca56-f3d5-4384-8a9c-680510d60142
0it [00:00, ?it/s]
Aside: Using the LangSmith Hub for Prompt Management
from langchain import hub
from langchain_openai.chat_models.base import _convert_message_to_dict
# Pin the prompt to one hub commit so the experiment below is reproducible.
HUB_COMMIT_HASH = "8d80588e"
obj = hub.pull(f"langchain-ai/movie-demo:{HUB_COMMIT_HASH}")

# Only the first message of the pulled prompt is kept; it is formatted and
# converted to a plain dict so it can be sent through the OpenAI client.
# NOTE(review): _convert_message_to_dict is a private langchain_openai helper
# and may change between releases.
hub_messages = [_convert_message_to_dict(m.format()) for m in obj.messages[:1]]
@traceable
def label_query_hub(text):
    """Classify ``text`` using the prompt messages pulled from the hub.

    The module-level ``hub_messages`` are followed by ``text`` as the user
    message. Returns the content of the first completion choice.
    """
    conversation = [*hub_messages, {"role": "user", "content": text}]
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=conversation,
    )
    return response.choices[0].message.content
# Experiment for the hub-managed prompt. The commit hash is recorded in both
# the experiment prefix and the metadata so results can be traced to the
# exact prompt version.
results = evaluate(
    lambda inputs: label_query_hub(inputs["text"]),
    data=toxic_dataset_name,
    evaluators=[correct_label],
    summary_evaluators=[summary_eval],
    experiment_prefix=f"Toxic Queries prompt @{HUB_COMMIT_HASH}",
    metadata={"prompt_version": HUB_COMMIT_HASH},
)
/var/folders/gf/6rnp_mbx5914kx7qmmh7xzmw0000gn/T/ipykernel_37884/1549686522.py:20: UserWarning: Function evaluate is in beta.
results = evaluate(
View the evaluation results for experiment: 'Toxic Queries prompt @8d80588e:b495152' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/dc6cc406-d05e-4e57-9c47-5868523f5a98/compare?selectedSessions=7b346b83-e094-414d-860b-8c4c0fb23714
0it [00:00, ?it/s]
Pt. 2 -- Multi-Turn Queries
# Define multi-turn examples
# Each entry is (queries, answers): three user turns and the reference
# answer for each turn, in the same order.
multi_turn_examples = [
    (
        [
            "Recommend some family-friendly movies for tonight",
            "Do any of these have an educational theme?",
            "Which one has the highest ratings?",
        ],
        [
            "Some family-friendly movies available are 'The Lion King', 'Finding Nemo', and 'The Incredibles'",
            "'The Lion King' and 'Finding Nemo' have educational themes about the circle of life and the importance of family",
            "'The Incredibles' has the highest ratings among them with a 94% on Rotten Tomatoes",
        ],
    ),
    (
        [
            "What are the top sci-fi movies on your service?",
            "Any recent ones?",
            "Can you suggest one that involves time travel?",
        ],
        [
            "Top sci-fi movies include 'Blade Runner 2049', 'Interstellar', and 'The Martian'",
            "A recent hit is 'Tenet', released in 2020",
            "'Interstellar' involves complex time travel themes and is highly recommended",
        ],
    ),
    (
        [
            "I'm looking for movies directed by Christopher Nolan",
            "Which one would you recommend for a movie night?",
            "What's the plot of 'Inception'?",
        ],
        [
            "Christopher Nolan movies available include 'Inception', 'Dunkirk', and 'Interstellar'",
            "'Inception' is a great pick for a movie night, offering a mix of action, drama, and mind-bending storytelling",
            "'Inception' is about a thief who steals corporate secrets through dream-sharing technology and is given the inverse task of planting an idea into the mind of a CEO",
        ],
    ),
    (
        [
            "Show me some popular romantic comedies",
            "Any classics in the list?",
            "Tell me more about 'When Harry Met Sally'",
        ],
        [
            "Popular romantic comedies include 'Crazy Rich Asians', 'The Big Sick', and 'When Harry Met Sally'",
            "'When Harry Met Sally' is considered a classic in the romantic comedy genre",
            "'When Harry Met Sally' explores the question of whether men and women can just be friends, through the story of its titular characters over the years",
        ],
    ),
    (
        [
            "Do you have documentaries on nature?",
            "Which one focuses on marine life?",
            "How long is 'Blue Planet II'?",
        ],
        [
            "Yes, we have 'Planet Earth II', 'Blue Planet II', and 'Our Planet'",
            "'Blue Planet II' focuses extensively on marine life, exploring the deep ocean, coral reefs, and the open sea",
            "'Blue Planet II' is approximately 7 hours long, spread across 7 episodes",
        ],
    ),
]

multi_turn_dataset_name = "Multi-Turn Queries"

# Seed the dataset only when it does not exist yet.
if not client.has_dataset(dataset_name=multi_turn_dataset_name):
    multi_turn_dataset = client.create_dataset(dataset_name=multi_turn_dataset_name)
    # Parallel tuples: one {"queries": [...]} input per {"answers": [...]} output.
    multi_turn_inputs = tuple(
        {"queries": queries} for queries, _ in multi_turn_examples
    )
    multi_turn_outputs = tuple(
        {"answers": answers} for _, answers in multi_turn_examples
    )
    client.create_examples(
        inputs=multi_turn_inputs,
        outputs=multi_turn_outputs,
        dataset_id=multi_turn_dataset.id,
    )
RAG Pipeline
import json
# Tool schema passed as `tools=` to the chat completion in
# generate_movie_search. It declares one function, "retrieve_movies", which
# takes a single required string argument named "query".
tools = [
{
"type": "function",
"function": {
"name": "retrieve_movies",
"description": "Retrieve a list of relevant movies and their metadata from a movie database.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The query used to retrieve movies from the movie database, for example 'Christopher Nolan films'",
},
},
"required": ["query"],
},
},
},
]
# System message used by generate_movie_search, which reads this module-level
# name at call time.
# NOTE(review): `system_prompt` is assigned again in the proxy section further
# down this file; calling generate_movie_search after that cell has run would
# pick up the later value.
system_prompt = """
Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous.
Note that if the question does not require additional search and can be answered using the chat history, simply respond with the answer.
Don't make up content that's not supplied in chat history.
"""
@traceable
def generate_movie_search(chat_history, query):
    """Ask the model how to handle ``query`` given the prior conversation.

    The request is the module-level ``system_prompt``, then ``chat_history``
    (a list of role/content dicts), then ``query`` as the newest user turn.
    The ``tools`` schema is attached to the request.

    Returns the first choice's message object (not just its text), so the
    caller can inspect ``tool_calls`` as well as ``content``.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        *chat_history,
        {"role": "user", "content": query},
    ]
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo-0613",
        messages=conversation,
        tools=tools,
    )
    return response.choices[0].message
def _convert_docs(results):
return [
{
"page_content": r,
"type": "Document",
}
for r in results
]
@traceable(run_type="retriever")
def retrieve_movies(query):
    """Return canned movie documents for ``query`` by keyword match.

    The lower-cased query is checked against a fixed keyword list in order;
    the first keyword it contains decides the result. When none matches, the
    romantic-comedy list is returned.
    """
    # Foo retriever. In production, this would search an actual database
    canned_results = (
        ("family-friendly", ["Lion King", "Finding Nemo", "The Incredibles"]),
        ("sci-fi", ["Blade Runner 2049", "Interstellar", "The Martian"]),
        ("nature", ["Planet Earth II", "Blue Planet II", "Our Planet"]),
        ("christopher nolan", ["Inception", "Dunkirk", "Interstellar"]),
    )
    lowered = query.lower()
    for keyword, titles in canned_results:
        if keyword in lowered:
            return _convert_docs(titles)
    return _convert_docs(["Crazy Rich Asians", "The Big Sick", "When Harry Met Sally"])
@traceable
def execute_function_call(message):
    """Run the first tool call carried by ``message``.

    Only ``message.tool_calls[0]`` is considered. For ``retrieve_movies`` the
    JSON-encoded arguments are parsed and the retriever's documents are
    returned. For any other function name an error *string* is returned
    instead of a document list.
    """
    requested = message.tool_calls[0].function
    if requested.name != "retrieve_movies":
        return f"Error: function {requested.name} does not exist"
    query = json.loads(requested.arguments)["query"]
    return retrieve_movies(query)
@traceable
def generate_answer(question, context):
    """Answer ``question`` using only the supplied ``context`` text.

    ``context`` is embedded in the system message and ``question`` is sent as
    the user message. Returns the content of the first completion choice.
    """
    grounding = f"Answer the user's question based only on the content below:\n\n{context}"
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=[
            {"role": "system", "content": grounding},
            {"role": "user", "content": question},
        ],
    )
    return response.choices[0].message.content
@traceable
def rag_pipeline(chat_history, question):
    """Answer ``question``, retrieving movie documents when the model asks to.

    Args:
        chat_history: list of prior role/content message dicts.
        question: the current user turn.

    Returns:
        The model's direct reply when it requested no tool call; otherwise
        the answer generated from the tool call's result.
    """
    message = generate_movie_search(chat_history, question)
    # Treat both None and an empty list as "no tool requested"; an empty list
    # would otherwise fail on tool_calls[0] downstream.
    if not message.tool_calls:
        return message.content

    docs = execute_function_call(message)
    if isinstance(docs, str):
        # execute_function_call returns a plain error string for an unknown
        # function; use it as the context instead of indexing into it as if
        # it were a list of documents.
        context = docs
    else:
        context = "\n".join(doc["page_content"] for doc in docs)
    return generate_answer(question, context)
@traceable
def run_multi_turn(queries):
    """Run ``queries`` through the RAG pipeline as one growing conversation.

    Each query is answered with the history of all earlier turns; the query
    and its answer are then appended to that history. Returns the answers in
    query order.
    """
    history = []
    answers = []
    for user_turn in queries:
        reply = rag_pipeline(history, user_turn)
        history.extend(
            [
                {"role": "user", "content": user_turn},
                {"role": "assistant", "content": reply},
            ]
        )
        answers.append(reply)
    return answers
Evaluate
def brevity(run, example) -> dict:
    """Score 1 when every turn in the run's output is at most 200 characters.

    Reads the list of turns from ``run.outputs["output"]``; ``example`` is
    not used. Returns ``{"score": 0}`` as soon as one turn exceeds 200
    characters, otherwise ``{"score": 1}``.
    """
    turns = run.outputs.get("output")
    too_long = any(len(reply) > 200 for reply in turns)
    return {"score": 0 if too_long else 1}
# Multi-turn experiment: each example's "queries" list is run as one
# conversation and scored only by the brevity evaluator.
results = evaluate(
    lambda inputs: run_multi_turn(inputs["queries"]),
    data=multi_turn_dataset_name,
    evaluators=[brevity],
    experiment_prefix="Multi-turn eval",
    metadata={"model": "gpt-3.5-turbo", "prompt_version": "003"},
)
/var/folders/gf/6rnp_mbx5914kx7qmmh7xzmw0000gn/T/ipykernel_37884/3314201279.py:9: UserWarning: Function evaluate is in beta.
results = evaluate(
View the evaluation results for experiment: 'Multi-turn eval:c9eee6b' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/3d3915cf-cc7b-4bf9-a583-32ff5e60069d/compare?selectedSessions=50f97dc4-5902-40d1-a071-2e77d06227de
0it [00:00, ?it/s]
Pt. 3 -- Structured Inputs
Create dataset
# Each entry is (inputs, reference_answer) or (inputs, reference_answer,
# metadata). `inputs` carries three named fields rather than a single string.
structured_input_examples = [
    (
        {
            "user_preferences": ["Sci-Fi", "Action"],
            "watch_history": ["The Matrix", "Inception"],
            "search_query": "What to watch next?",
        },
        "Based on your love for Sci-Fi and Action movies, and considering you've recently watched 'The Matrix' and 'Inception', you might enjoy 'Blade Runner 2049' for its deep narrative and stunning visuals.",
        # Example adding notes + metadata
        {"note": "This is a free-form note"},
    ),
    (
        {
            "user_preferences": ["Drama", "Historical"],
            "watch_history": ["The Crown", "Downton Abbey"],
            "search_query": "Looking for a movie with a strong storyline",
        },
        "Given your interest in Drama and Historical themes, and your watch history, 'The King's Speech' offers a compelling storyline with remarkable performances.",
        {"note": "This is another free_form note.", "cohort_number": 3},
    ),
    (
        {
            "user_preferences": ["Comedy", "Romance"],
            "watch_history": ["Friends", "The Big Bang Theory"],
            "search_query": "Need a light-hearted movie",
        },
        "Considering your preference for Comedy and Romance, along with enjoying shows like 'Friends', you'd likely enjoy 'Crazy Rich Asians' for its humor and heartwarming romance.",
    ),
    (
        {
            "user_preferences": ["Thriller", "Mystery"],
            "watch_history": ["Sherlock", "Mindhunter"],
            "search_query": "Suggest a suspenseful movie",
        },
        "With your taste leaning towards Thriller and Mystery, and considering you've watched 'Sherlock' and 'Mindhunter', 'Gone Girl' would be an excellent choice for its suspense and plot twists.",
    ),
    (
        {
            "user_preferences": ["Documentary", "Nature"],
            "watch_history": ["Planet Earth", "Blue Planet II"],
            "search_query": "Want to watch something about wildlife",
        },
        "Your interest in Documentaries and Nature, along with watching 'Planet Earth' and 'Blue Planet II', suggests you would enjoy 'The Serengeti Rules', which beautifully captures wildlife and ecosystems.",
    ),
    (
        {
            "user_preferences": ["Fantasy", "Adventure"],
            "watch_history": ["Harry Potter series", "The Hobbit"],
            "search_query": "Fantasy movies for the weekend?",
        },
        "Given your love for Fantasy and Adventure, having watched the 'Harry Potter series' and 'The Hobbit', 'The Witcher' series would be a fantastic choice for your weekend binge.",
    ),
    (
        {
            "user_preferences": ["Animation", "Family"],
            "watch_history": ["Finding Nemo", "Toy Story"],
            "search_query": "Animated movies that are fun for all ages?",
        },
        "With a preference for Animation and Family-friendly content, and given your history with 'Finding Nemo' and 'Toy Story', 'Coco' is highly recommended for its fun story and universal appeal.",
    ),
    (
        {
            "user_preferences": ["Horror", "Supernatural"],
            "watch_history": ["The Haunting of Hill House", "Stranger Things"],
            "search_query": "Scary movies that aren’t too gory?",
        },
        "As a fan of Horror and Supernatural genres, and having enjoyed 'The Haunting of Hill House' and 'Stranger Things', 'A Quiet Place' offers suspense without relying on gore.",
    ),
    (
        {
            "user_preferences": ["Musical", "Drama"],
            "watch_history": ["La La Land", "The Greatest Showman"],
            "search_query": "Musicals with a strong emotional core?",
        },
        "Your enjoyment of Musicals and Drama, seen in 'La La Land' and 'The Greatest Showman', means you might find 'Les Misérables' to be a powerful experience with its deep emotional resonance.",
    ),
    (
        {
            "user_preferences": ["Crime", "Legal Drama"],
            "watch_history": ["Breaking Bad", "Better Call Saul"],
            "search_query": "Engaging legal dramas?",
        },
        "Considering your interest in Crime and Legal Drama, with 'Breaking Bad' and 'Better Call Saul' in your watch history, 'The Trial of the Chicago 7' is recommended for its engaging narrative and historical significance.",
    ),
]

structured_input_dataset_name = "Structured Inputs"

# Seed the dataset only when it does not exist yet. Examples are created one
# at a time because some of them carry their own metadata.
if not client.has_dataset(dataset_name=structured_input_dataset_name):
    structured_input_dataset = client.create_dataset(
        dataset_name=structured_input_dataset_name
    )
    for input_tuple in structured_input_examples:
        if len(input_tuple) == 3:
            inputs, answer, metadata = input_tuple
        else:
            # Two-element entries have no metadata.
            (inputs, answer), metadata = input_tuple, None
        client.create_example(
            inputs=inputs,
            outputs={"answer": answer},
            metadata=metadata,
            dataset_id=structured_input_dataset.id,
        )
# System-message template for generate_recommendation. It is filled with
# str.format using the `watch_history` and `user_preferences` fields.
# The wording "explicited stated" was a typo and now reads "explicitly stated".
system_prompt_template = """Respond to the user's search query given what you know about them.
You know they just watched: {watch_history}
You know they have explicitly stated preferences for: {user_preferences}"""
@traceable
def generate_recommendation(search_query, watch_history, user_preferences):
    """Recommend something for ``search_query`` given what is known of the user.

    ``watch_history`` and ``user_preferences`` are formatted into
    ``system_prompt_template`` to form the system message; ``search_query``
    is the user message. Returns the content of the first completion choice.
    """
    personalised_prompt = system_prompt_template.format(
        watch_history=watch_history,
        user_preferences=user_preferences,
    )
    conversation = [
        {"role": "system", "content": personalised_prompt},
        {"role": "user", "content": search_query},
    ]
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=conversation,
    )
    return response.choices[0].message.content
from langsmith.evaluation import LangChainStringEvaluator
# The evaluator expects a single input/prediction/reference.
# Our dataset has multiple inputs.
# You can configure the data the wrapped evaluator sees.
def prepare_eval_inputs(run, example):
    """Map a run/example pair onto the three fields the evaluator reads.

    Returns a dict with ``input`` (the example's ``search_query``),
    ``prediction`` (the run's ``output``) and ``reference`` (the example's
    ``answer``).
    """
    return dict(
        input=example.inputs["search_query"],
        prediction=run.outputs["output"],
        reference=example.outputs["answer"],
    )
# Wrap the off-the-shelf "cot_qa" string evaluator; prepare_eval_inputs picks
# which fields of the run and example it receives.
correctness_evaluator = LangChainStringEvaluator(
    "cot_qa",
    prepare_data=prepare_eval_inputs,
)

structured_input_dataset_name = "Structured Inputs"

# Each example's inputs dict is unpacked into generate_recommendation's
# keyword arguments, so the dataset's input keys must match its parameter names.
result = evaluate(
    lambda inputs: generate_recommendation(**inputs),
    data=structured_input_dataset_name,
    evaluators=[correctness_evaluator],
    experiment_prefix="Recommendations",
)
/var/folders/gf/6rnp_mbx5914kx7qmmh7xzmw0000gn/T/ipykernel_37884/2506121668.py:19: UserWarning: Function evaluate is in beta.
result = evaluate(
View the evaluation results for experiment: 'Recommendations:cb1089c' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/18b47447-9a35-4906-bacf-35548a0e8e57/compare?selectedSessions=168393b7-dbb0-4760-8814-af29e6708e40
0it [00:00, ?it/s]
Pt. 4 -- Dataset Versioning & Metadata
Every time an example is created, updated, or deleted, a new dataset version is saved. That version can be
retrieved by listing the examples with `as_of` set to
the time of that modification.
You can save "semantic" versions of the dataset by tagging specific times with names.
A tag can be assigned to at most 1 version at a time.
import datetime
# Snapshot the current examples and remember the most recent modification
# time among them; it is used below as the `as_of` marker for this version.
examples = list(client.list_examples(dataset_name=toxic_dataset_name))
initial_time = max(e.modified_at for e in examples)
len(examples)
13
# Add one more example, with metadata, to the existing dataset.
example = client.create_example(
    dataset_name=toxic_dataset_name,
    inputs={"text": "hi there"},
    outputs={"label": "Not toxic"},
    metadata={"recent": True},
)
# Count the examples visible as of the current time (timezone-aware UTC).
len(
    list(
        client.list_examples(
            as_of=datetime.datetime.now(tz=datetime.timezone.utc),
            dataset_name=toxic_dataset_name,
        )
    )
)
14
# Check for the time at which we first ran
# (initial_time was captured before the extra example was created)
len(
    list(
        client.list_examples(as_of=initial_time, dataset_name=toxic_dataset_name)
    )
)
13
# You can tag a specific dataset version with a semantic name, like "prod"
client.update_dataset_tag(
    dataset_name=toxic_dataset_name,
    as_of=initial_time,
    tag="prod",
)
# You can then query the dataset for that version
# (`as_of` is given the tag name here instead of a timestamp)
len(list(client.list_examples(as_of="prod", dataset_name=toxic_dataset_name)))
13
# Compare the tagged "prod" version with the "latest" one and print the diff.
from_version, to_version = "prod", "latest"
diff = client.diff_dataset_versions(
    dataset_name=toxic_dataset_name,
    to_version=to_version,
    from_version=from_version,
)
print(diff)
examples_modified=[] examples_added=[UUID('0a602ee7-e438-4ee6-a616-e7807cf2a373')] examples_removed=[]
# You can then use tags to continue to evaluate on the same version of a dataset
# Only updating your testing flow once you are ready to commit to a new version
# Here `data` is the example listing pinned to the "prod" tag rather than a
# dataset name, and each example's inputs dict is unpacked into label_query.
result = evaluate(
    lambda inputs: label_query(**inputs),
    data=client.list_examples(dataset_name=toxic_dataset_name, as_of="prod"),
    evaluators=[correct_label],
    summary_evaluators=[summary_eval],
    experiment_prefix="dataset versioning example",
    metadata={"prompt_version": "001"},
)
/var/folders/gf/6rnp_mbx5914kx7qmmh7xzmw0000gn/T/ipykernel_37884/1734478713.py:3: UserWarning: Function evaluate is in beta.
result = evaluate(
View the evaluation results for experiment: 'dataset versioning example:856cee9' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/dc6cc406-d05e-4e57-9c47-5868523f5a98/compare?selectedSessions=b11e9800-ae10-47b7-ae67-54b8f21b8da4
0it [00:00, ?it/s]
Pt. 5 -- Proxy
from langsmith import traceable, wrappers
from openai import OpenAI
# Point the OpenAI client at the local proxy endpoint. This rebinds the
# module-level `openai` name, so every function in this file that calls
# `openai.chat.completions.create` uses the proxied client from here on.
openai = wrappers.wrap_openai(OpenAI(base_url="http://localhost:8080/proxy/openai"))

# Template for the description request; generate_movie sends it as the user
# message. This also rebinds `system_prompt`, which generate_movie_search reads.
system_prompt = """Generate a three paragraph description of a movie about this topic: {topic}. Do not specify a title."""
@traceable
def generate_movie(topic):
    """Generate a movie description for ``topic``.

    Formats ``topic`` into the module-level ``system_prompt`` template, sends
    it as a single user message, and returns the first choice's content.
    """
    request = {"role": "user", "content": system_prompt.format(topic=topic)}
    response = openai.chat.completions.create(model="gpt-4", messages=[request])
    return response.choices[0].message.content
@traceable
def generate_title(description):
    """Generate a title for the movie described by ``description``.

    Sends one user message containing the description and returns the first
    choice's content.
    """
    request = {
        "role": "user",
        "content": f"Generate a title for the following movie description:\n\n{description}.",
    }
    response = openai.chat.completions.create(model="gpt-4", messages=[request])
    return response.choices[0].message.content
@traceable
def pipeline(topic):
    """Produce a movie description for ``topic``, then a title for it.

    Returns a dict with the keys ``"description"`` and ``"title"``.
    """
    movie_description = generate_movie(topic)
    return {
        "description": movie_description,
        "title": generate_title(movie_description),
    }
# Topics only: these examples have inputs but no reference outputs.
movie_creation_examples = ["soccer", "a pop star", "action movie in venice"]
movie_creation_dataset_name = "Movie Creation"

# Seed the dataset only when it does not exist yet.
if not client.has_dataset(dataset_name=movie_creation_dataset_name):
    movie_dataset = client.create_dataset(dataset_name=movie_creation_dataset_name)
    for topic in movie_creation_examples:
        client.create_example(
            dataset_id=movie_dataset.id,
            inputs={"topic": topic},
        )
# First pass through the proxy ("cold cache"): no evaluators are attached,
# the run only records the pipeline outputs for each topic.
result = evaluate(
    lambda inputs: pipeline(**inputs),
    data=movie_creation_dataset_name,
    experiment_prefix="cold cache",
    metadata={"prompt_version": "1"},
)
View the evaluation results for project 'cold cache' at:
https://smith.langchain.com/o/8d28a774-8361-496d-a5d4-dd582a8d1b10/datasets/26db3352-e870-48b3-910b-e3cd003b0ab4/compare?selectedSessions=d8292996-db53-4ea8-92fd-6b0a3b26ec30
View all tests for Dataset Movie Creation at:
https://smith.langchain.com/o/8d28a774-8361-496d-a5d4-dd582a8d1b10/datasets/26db3352-e870-48b3-910b-e3cd003b0ab4
[------------------------------------------------->] 3/3
@traceable
def generate_title(description):
    """Generate a Spanish title for the movie described by ``description``.

    Redefines the earlier ``generate_title``; only the prompt wording
    changes. Returns the first choice's content.
    """
    request = {
        "role": "user",
        "content": f"Generate a title in SPANISH for the following movie description:\n\n{description}.",
    }
    response = openai.chat.completions.create(model="gpt-4", messages=[request])
    return response.choices[0].message.content
@traceable
def pipeline(topic):
    """Produce a movie description for ``topic``, then a title for it.

    Same steps as the earlier ``pipeline`` definition. Returns a dict with
    the keys ``"description"`` and ``"title"``.
    """
    movie_description = generate_movie(topic)
    return {
        "description": movie_description,
        "title": generate_title(movie_description),
    }
# Second pass through the proxy ("warm cache"), run after generate_title was
# redefined with the Spanish-title prompt.
result = evaluate(
    lambda inputs: pipeline(inputs["topic"]),
    data=movie_creation_dataset_name,
    experiment_prefix="warm cache",
    metadata={"prompt_version": "2"},
)
View the evaluation results for project 'warm cache' at:
https://smith.langchain.com/o/8d28a774-8361-496d-a5d4-dd582a8d1b10/datasets/26db3352-e870-48b3-910b-e3cd003b0ab4/compare?selectedSessions=e5b69871-cd7f-433b-a29f-4a75bf7b4094
View all tests for Dataset Movie Creation at:
https://smith.langchain.com/o/8d28a774-8361-496d-a5d4-dd582a8d1b10/datasets/26db3352-e870-48b3-910b-e3cd003b0ab4
[------------------------------------------------->] 3/3