From 1b5997defcf5248974ea4ed133552371a7ea02fb Mon Sep 17 00:00:00 2001 From: caizer0x Date: Tue, 25 Mar 2025 12:19:10 -0400 Subject: [PATCH 1/4] Add multi turn evals Add more complex evals setup basic eval for transfers Added evals for all langchain tools change test name remove duplicate fix imports change location m remove unnecessary changes get rid of change to package.json --- .../gibwork/multi_create_gibwork_task_eval.ts | 38 ++ .../evals/jupiter/multi_token_data_eval.ts | 45 ++ .../evals/jupiter/multi_token_swap_eval.ts | 38 ++ .../multi_solana_deploy_collection_eval.ts | 35 ++ src/langchain/evals/multi/basics.evals.ts | 400 ++++++++++++++++++ ...ulti_solana_openbook_create_market_eval.ts | 29 ++ .../multi_pumpfun_token_launch_eval.ts | 29 ++ .../evals/solana/multi_balance_other_eval.ts | 30 ++ .../evals/solana/multi_solana_restake_eval.ts | 17 + .../evals/solana/multi_transfer_eval.ts | 17 + .../multi_solana_cancel_nft_listing_eval.ts | 22 + .../multi_solana_list_nft_for_sale_eval.ts | 26 ++ .../tiplink/multi_solana_tiplink_eval.ts | 19 + src/langchain/evals/utils/runEvals.ts | 148 ++++++- 14 files changed, 870 insertions(+), 23 deletions(-) create mode 100644 src/langchain/evals/gibwork/multi_create_gibwork_task_eval.ts create mode 100644 src/langchain/evals/jupiter/multi_token_data_eval.ts create mode 100644 src/langchain/evals/jupiter/multi_token_swap_eval.ts create mode 100644 src/langchain/evals/metaplex/multi_solana_deploy_collection_eval.ts create mode 100644 src/langchain/evals/multi/basics.evals.ts create mode 100644 src/langchain/evals/openbook/multi_solana_openbook_create_market_eval.ts create mode 100644 src/langchain/evals/pumpfun/multi_pumpfun_token_launch_eval.ts create mode 100644 src/langchain/evals/solana/multi_balance_other_eval.ts create mode 100644 src/langchain/evals/solana/multi_solana_restake_eval.ts create mode 100644 src/langchain/evals/solana/multi_transfer_eval.ts create mode 100644 
src/langchain/evals/tensor/multi_solana_cancel_nft_listing_eval.ts create mode 100644 src/langchain/evals/tensor/multi_solana_list_nft_for_sale_eval.ts create mode 100644 src/langchain/evals/tiplink/multi_solana_tiplink_eval.ts diff --git a/src/langchain/evals/gibwork/multi_create_gibwork_task_eval.ts b/src/langchain/evals/gibwork/multi_create_gibwork_task_eval.ts new file mode 100644 index 000000000..d5c0d8157 --- /dev/null +++ b/src/langchain/evals/gibwork/multi_create_gibwork_task_eval.ts @@ -0,0 +1,38 @@ +import { runComplexEval, ComplexEvalDataset } from "../utils/runEvals"; + +const DATASET: ComplexEvalDataset[] = [ + { + description: "Multi-turn create Gibwork task", + inputs: { + query: "I need to create a new Gibwork task", + }, + turns: [ + { input: "I need to create a new Gibwork task" }, + { input: "The task is titled 'Fix my website'" }, + { + input: "Also, what's the current price of JUP?", + expectedToolCall: { + tool: "solana_token_data", + params: "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN", + }, + }, + { input: "It should be for 1000 JUP tokens with no extra content" }, + { + input: "Set content and requirements to N/A and tag it as webdev", + expectedToolCall: { + tool: "create_gibwork_task", + params: { + title: "Fix my website", + content: "N/A", + requirements: "N/A", + tags: ["webdev"], + tokenMintAddress: "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN", + amount: 10, + }, + }, + }, + ], + }, +]; + +runComplexEval(DATASET, "Multi-turn Create Gibwork Task test"); diff --git a/src/langchain/evals/jupiter/multi_token_data_eval.ts b/src/langchain/evals/jupiter/multi_token_data_eval.ts new file mode 100644 index 000000000..387517760 --- /dev/null +++ b/src/langchain/evals/jupiter/multi_token_data_eval.ts @@ -0,0 +1,45 @@ +import { runComplexEval, ComplexEvalDataset } from "../utils/runEvals"; + +const DATASET: ComplexEvalDataset[] = [ + { + description: "Multi-turn token data inquiry", + inputs: { + query: "What's the price of KING?", + }, + 
turns: [ + { input: "What's the price of KING?" }, + { + input: + "The mint address is 5eqNDjbsWL9hfAqUfhegTxgEa3XardzGdVAboMA4pump", + expectedToolCall: { + tool: "solana_token_data", + params: "5eqNDjbsWL9hfAqUfhegTxgEa3XardzGdVAboMA4pump", + }, + }, + { + input: "Buy 20 tokens using USDC", + expectedToolCall: { + tool: "solana_trade", + params: { + inputAmount: 20, + inputMint: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", + outputMint: "5eqNDjbsWL9hfAqUfhegTxgEa3XardzGdVAboMA4pump", + slippageBps: 100, + }, + }, + }, + { + input: "And check my KING balance", + expectedToolCall: { + tool: "solana_balance_other", + params: { + walletAddress: "GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", + tokenAddress: "5eqNDjbsWL9hfAqUfhegTxgEa3XardzGdVAboMA4pump", + }, + }, + }, + ], + }, +]; + +runComplexEval(DATASET, "Multi-turn Token Data test"); diff --git a/src/langchain/evals/jupiter/multi_token_swap_eval.ts b/src/langchain/evals/jupiter/multi_token_swap_eval.ts new file mode 100644 index 000000000..8d9e9eaaa --- /dev/null +++ b/src/langchain/evals/jupiter/multi_token_swap_eval.ts @@ -0,0 +1,38 @@ +import { runComplexEval, ComplexEvalDataset } from "../utils/runEvals"; + +const DATASET: ComplexEvalDataset[] = [ + { + description: "Multi-turn token swap", + inputs: { + query: "I want to swap some tokens", + }, + turns: [ + { input: "I want to swap some tokens" }, + { input: "I want to exchange USDC for JUP tokens" }, + { + input: "Swap 10 USDC for JUP with 1% slippage", + expectedToolCall: { + tool: "solana_trade", + params: { + outputMint: "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN", + inputAmount: 10, + inputMint: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", + slippageBps: 100, + }, + }, + }, + { + input: + "Then check the USDC balance of GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", + expectedToolCall: { + tool: "solana_balance_other", + params: { + tokenAddress: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", + }, + }, + }, + ], + }, +]; + 
+runComplexEval(DATASET, "Multi-turn Token Swap test"); diff --git a/src/langchain/evals/metaplex/multi_solana_deploy_collection_eval.ts b/src/langchain/evals/metaplex/multi_solana_deploy_collection_eval.ts new file mode 100644 index 000000000..2bd993753 --- /dev/null +++ b/src/langchain/evals/metaplex/multi_solana_deploy_collection_eval.ts @@ -0,0 +1,35 @@ +import { runComplexEval, ComplexEvalDataset } from "../utils/runEvals"; + +const DATASET: ComplexEvalDataset[] = [ + { + description: "Multi-turn NFT collection deployment", + inputs: { + query: "I want to deploy an NFT collection", + }, + turns: [ + { input: "I want to deploy an NFT collection" }, + { input: "The collection should be named MyCollection" }, + { + input: + "Its metadata URI is https://metadata.mycoll.io/collection.json. Set the royalty to 250 basis points", + expectedToolCall: { + tool: "solana_deploy_collection", + params: { + name: "MyCollection", + uri: "https://metadata.mycoll.io/collection.json", + royaltyBasisPoints: 250, + }, + }, + }, + { + input: "Also, retrieve the deployed collection details", + expectedToolCall: { + tool: "get_asset", + params: { collection: "MyCollection" }, + }, + }, + ], + }, +]; + +runComplexEval(DATASET, "Multi-turn solana_deploy_collection test"); diff --git a/src/langchain/evals/multi/basics.evals.ts b/src/langchain/evals/multi/basics.evals.ts new file mode 100644 index 000000000..362a12107 --- /dev/null +++ b/src/langchain/evals/multi/basics.evals.ts @@ -0,0 +1,400 @@ +import { runComplexEval, ComplexEvalDataset } from "../utils/runEvals"; + +const DATASET: ComplexEvalDataset[] = [ + { + description: + "Multi-turn flow: Check JUP price, check JUP balance, sell all JUP, stake SOL", + inputs: { + query: "I want to manage my JUP and SOL", + }, + turns: [ + { + input: "What’s the current price of JUP?", + expectedToolCall: { + tool: "solana_fetch_price", + params: { address: "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN" }, + }, + }, + { + input: "How much JUP do 
I have?", + expectedToolCall: { + tool: "solana_balance", + params: { + tokenAddress: "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN", + }, + }, + }, + { + input: "Sell all my JUP for SOL. Try it even if the balance is 0.", + expectedToolCall: { + tool: "jupiter_trade", + params: { + inputMint: "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN", + outputMint: "So11111111111111111111111111111111111111112", + // amount: "all", // leave out this field since we dont know the amount + slippage: 0.5, + }, + }, + }, + { + input: "Stake 1 SOL", + expectedToolCall: { + tool: "solana_restake", + params: { amount: 1 }, + }, + }, + ], + }, + { + description: "Multi-turn flow: Check balance and send SOL and tokens", + inputs: { + query: "I want to send some SOL to a friend", + }, + turns: [ + { + input: "How much SOL do I have?", + expectedToolCall: { + tool: "solana_balance", + params: {}, + }, + }, + { + input: + "Transfer 0.1 SOL to GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", + expectedToolCall: { + tool: "solana_transfer", + params: { + to: "GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", + amount: 0.1, + }, + }, + }, + { + input: "Check my SOL balance again", + expectedToolCall: { + tool: "solana_balance", + params: {}, + }, + }, + { + input: "How much JUP do I have?", + expectedToolCall: { + tool: "solana_balance", + params: { + tokenAddress: "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN", + }, + }, + }, + { + input: "Send it all to GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", + expectedToolCall: { + tool: "solana_transfer", + params: { + to: "GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", + amount: 0.1, + tokenAddress: "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN", + }, + }, + }, + ], + }, + { + description: "Multi-turn flow: Check balance, mint NFT, verify assets", + inputs: { + query: "I want to create an NFT", + }, + turns: [ + { + input: "How much SOL do I have? 
I need some to mint NFTs?", + expectedToolCall: { + tool: "solana_balance", + params: {}, + }, + }, + { + input: + "Mint an NFT with name 'MyFirstNFT' and symbol 'MFN', uri: https://example.com/nft.json.", + expectedToolCall: { + tool: "solana_mint_nft", + params: { + name: "MyFirstNFT", + symbol: "MFN", + uri: "https://example.com/nft.json", + }, + }, + }, + { + input: "Check my assets to see the new NFT", + expectedToolCall: { + tool: "solana_get_all_assets_by_owner", + params: {}, + }, + }, + ], + }, + { + description: "Multi-turn flow: Create multisig, deposit SOL", + inputs: { + query: "I want to set up a multisig wallet", + }, + turns: [ + { + input: + "Create a 2-of-2 multisig with GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", + expectedToolCall: { + tool: "create_2by2_multisig", + params: { creator: "GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB" }, + }, + }, + { + input: "Deposit 1 SOL into the multisig", + expectedToolCall: { + tool: "deposit_to_2by2_multisig", + params: { amount: 1 }, + }, + }, + ], + }, + { + description: + "Multi-turn flow: Price check, buy tokens, check balance, stake SOL", + inputs: { + query: + "Check the price of USDC, buy 10 USDC using SOL, check my USDC balance, then stake 0.5 SOL.", + }, + turns: [ + { + input: "What's the price of USDC?", + expectedToolCall: { + tool: "solana_fetch_price", + params: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", + }, + }, + { + input: "Buy 10 USDC with 2% slippage using my SOL", + expectedToolCall: { + tool: "solana_trade", + params: { + outputMint: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", + inputAmount: 10, + inputMint: "So11111111111111111111111111111111111111112", + slippageBps: 200, + }, + }, + }, + { + input: "Now check my USDC balance", + expectedToolCall: { + tool: "solana_balance", + params: JSON.stringify({ + tokenAddres: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", + }), + }, + }, + { + input: "Stake 0.5 SOL", + expectedToolCall: { + tool: "solana_stake", + params: 
JSON.stringify({ amount: 0.5 }), + }, + }, + ], + }, + { + description: + "Use a Drift Vault: create vault, deposit USDC, check vault info", + inputs: { + query: "I want to create and deposit into a new drift vault.", + }, + turns: [ + { + input: + "Create a drift vault named 'LeverageVault' with redeemPeriod=2 days and profitShare=15", + expectedToolCall: { + tool: "create_drift_vault", + params: { + name: "LeverageVault", + redeemPeriod: 2, + profitShare: 15, + }, + }, + }, + { + input: "Deposit 200 USDC into the vault", + expectedToolCall: { + tool: "deposit_into_drift_vault", + params: { + vaultAddress: "LeverageVault", + amount: 200, + }, + }, + }, + { + input: "Show me info about vault named 'LeverageVault'", + expectedToolCall: { + tool: "drift_vault_info", + params: { + vaultNameOrAddress: "LeverageVault", + }, + }, + }, + ], + }, + { + description: "Multi-turn flow: get a token's info, then trade it for SOL", + inputs: { + query: "I want to check a token's info, then sell it for SOL.", + }, + turns: [ + { + input: + "What is the token info for mint address JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN?", + expectedToolCall: { + tool: "solana_token_data", + params: "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN", + }, + }, + { + input: "How much of that token do I hold?", + expectedToolCall: { + tool: "solana_balance", + params: JSON.stringify({ + tokenAddress: "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN", + }), + }, + }, + { + input: "Sell all of my JUP for SOL with a 1% slippage tolerance.", + expectedToolCall: { + tool: "solana_trade", + params: { + inputMint: "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN", + outputMint: "So11111111111111111111111111111111111111112", + slippageBps: 100, + }, + }, + }, + ], + }, + { + description: + "Multi-turn flow: send tokens to multiple addresses and check final balances", + inputs: { + query: "I want to distribute some tokens to multiple wallets", + }, + turns: [ + { + input: + "How many tokens do I have at mint 
EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v?", + expectedToolCall: { + tool: "solana_balance", + params: JSON.stringify({ + tokenAddress: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", + }), + }, + }, + { + input: + "Send 50 of those tokens to wallet GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", + expectedToolCall: { + tool: "solana_transfer", + params: { + to: "GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", + amount: 50, + mint: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", + }, + }, + }, + { + input: + "Then send 25 tokens to 9Sx1apT66k8Ne5TP8PFua5w9DCQ8HztqZ4ZGh9Ejp2x2", + expectedToolCall: { + tool: "solana_transfer", + params: { + to: "9Sx1apT66k8Ne5TP8PFua5w9DCQ8HztqZ4ZGh9Ejp2x2", + amount: 25, + mint: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", + }, + }, + }, + { + input: "Check how many tokens I have left", + expectedToolCall: { + tool: "solana_balance", + params: JSON.stringify({ + tokenAddress: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", + }), + }, + }, + ], + }, + { + description: + "Multi-turn flow: buy tokens with SOL, then stake them, re-check SOL balance", + inputs: { + query: + "I want to buy some tokens, stake them, and see how much SOL remains.", + }, + turns: [ + { + input: "Buy 10 USDC with my SOL using 2% slippage.", + expectedToolCall: { + tool: "solana_trade", + params: { + outputMint: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", + inputAmount: 10, + inputMint: "So11111111111111111111111111111111111111112", + slippageBps: 200, + }, + }, + }, + { + input: "Now stake 1 SOL", + expectedToolCall: { + tool: "solana_stake", + params: JSON.stringify({ amount: 1 }), + }, + }, + { + input: "Check how much SOL I have left after staking", + expectedToolCall: { + tool: "solana_balance", + params: "{}", + }, + }, + ], + }, + { + description: + "Multi-turn flow: stake 2.5 SOL if I have enough, otherwise request from faucet, then re-check SOL balance", + inputs: { + query: "I want to stake 2.5 SOL. 
Not sure if I have enough though.", + }, + turns: [ + { + input: "How many SOL do I have?", + expectedToolCall: { + tool: "solana_balance", + params: "{}", + }, + }, + { + input: + "If it's under 2.5, please request from faucet, else let's proceed.", + expectedToolCall: { + tool: "solana_request_funds", + params: "{}", + }, + }, + { + input: "Check how much SOL I have now", + expectedToolCall: { + tool: "solana_balance", + params: "{}", + }, + }, + ], + }, +]; + +runComplexEval(DATASET, "Multi-turn Basic User Flows test"); diff --git a/src/langchain/evals/openbook/multi_solana_openbook_create_market_eval.ts b/src/langchain/evals/openbook/multi_solana_openbook_create_market_eval.ts new file mode 100644 index 000000000..fb4770103 --- /dev/null +++ b/src/langchain/evals/openbook/multi_solana_openbook_create_market_eval.ts @@ -0,0 +1,29 @@ +import { runComplexEval, ComplexEvalDataset } from "../utils/runEvals"; + +const DATASET: ComplexEvalDataset[] = [ + { + description: "Multi-turn openbook market creation", + inputs: { + query: "I need to create a new openbook market", + }, + turns: [ + { input: "I need to create a new openbook market" }, + { input: "Let’s use SOL as the base mint" }, + { input: "And USDC as the quote mint" }, + { + input: "Set the lot size to 100 and tick size to 1.5", + expectedToolCall: { + tool: "solana_openbook_create_market", + params: { + baseMint: "So11111111111111111111111111111111111111112", + quoteMint: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", + lotSize: 100, + tickSize: 1.5, + }, + }, + }, + ], + }, +]; + +runComplexEval(DATASET, "Multi-turn solana_openbook_create_market test"); diff --git a/src/langchain/evals/pumpfun/multi_pumpfun_token_launch_eval.ts b/src/langchain/evals/pumpfun/multi_pumpfun_token_launch_eval.ts new file mode 100644 index 000000000..f4d5129e4 --- /dev/null +++ b/src/langchain/evals/pumpfun/multi_pumpfun_token_launch_eval.ts @@ -0,0 +1,29 @@ +import { runComplexEval, ComplexEvalDataset } from 
"../utils/runEvals"; + +const DATASET: ComplexEvalDataset[] = [ + { + description: "Multi-turn PumpFun token launch", + inputs: { + query: "I want to launch a new PumpFun token", + }, + turns: [ + { input: "I want to launch a new PumpFun token" }, + { input: "I want it to be called YOLO" }, + { input: "The ticker should be YOLO and description 'yolo token'" }, + { + input: "Use the image URL https://example.com/yolo.png", + expectedToolCall: { + tool: "solana_launch_pumpfun_token", + params: { + tokenName: "YOLO", + tokenTicker: "YOLO", + description: "yolo token", + imageUrl: "https://example.com/yolo.png", + }, + }, + }, + ], + }, +]; + +runComplexEval(DATASET, "Multi-turn PumpFun token launch test"); diff --git a/src/langchain/evals/solana/multi_balance_other_eval.ts b/src/langchain/evals/solana/multi_balance_other_eval.ts new file mode 100644 index 000000000..506105363 --- /dev/null +++ b/src/langchain/evals/solana/multi_balance_other_eval.ts @@ -0,0 +1,30 @@ +import { runComplexEval, ComplexEvalDataset } from "../utils/runEvals"; + +const DATASET: ComplexEvalDataset[] = [ + { + description: "Multi-turn USDC balance for another wallet", + inputs: { + query: "I want to check my friend's balance", + }, + turns: [ + { input: "Check my friend's USDC balance" }, + { + input: + "The wallet address is GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", + expectedToolCall: { + tool: "solana_balance_other", + params: { + walletAddress: "GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", + tokenAddress: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", + }, + }, + }, + { + input: "Also, check my SOL balance", + expectedToolCall: { tool: "solana_balance", params: {} }, + }, + ], + }, +]; + +runComplexEval(DATASET, "Multi-turn Balance Other test"); diff --git a/src/langchain/evals/solana/multi_solana_restake_eval.ts b/src/langchain/evals/solana/multi_solana_restake_eval.ts new file mode 100644 index 000000000..f00c858be --- /dev/null +++ 
b/src/langchain/evals/solana/multi_solana_restake_eval.ts @@ -0,0 +1,17 @@ +import { runComplexEval, ComplexEvalDataset } from "../utils/runEvals"; + +const DATASET: ComplexEvalDataset[] = [ + { + description: "Multi-turn restake SOL", + inputs: { + query: "I want to restake my SOL" + }, + turns: [ + { input: "I want to restake my SOL" }, + { input: "Please restake 1.5 SOL for me", expectedToolCall: { tool: "solana_restake", params: { amount: 1.5 } } }, + { input: "Then check my updated SOL balance", expectedToolCall: { tool: "solana_balance", params: {} } } + ] + } +]; + +runComplexEval(DATASET, "Multi-turn Restake test"); \ No newline at end of file diff --git a/src/langchain/evals/solana/multi_transfer_eval.ts b/src/langchain/evals/solana/multi_transfer_eval.ts new file mode 100644 index 000000000..5e78677a9 --- /dev/null +++ b/src/langchain/evals/solana/multi_transfer_eval.ts @@ -0,0 +1,17 @@ +import { runComplexEval, ComplexEvalDataset } from "../utils/runEvals"; + +const DATASET: ComplexEvalDataset[] = [ + { + description: "Multi-turn SOL transfer", + inputs: { + query: "I want to send some SOL" + }, + turns: [ + { input: "I want to send some SOL" }, + { input: "Please transfer 0.05 SOL" }, + { input: "Send it to wallet GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", expectedToolCall: { tool: "solana_transfer", params: { to: "GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", amount: 0.05 } } } + ] + } +]; + +runComplexEval(DATASET, "Multi-turn Transfer test"); \ No newline at end of file diff --git a/src/langchain/evals/tensor/multi_solana_cancel_nft_listing_eval.ts b/src/langchain/evals/tensor/multi_solana_cancel_nft_listing_eval.ts new file mode 100644 index 000000000..99a719b0d --- /dev/null +++ b/src/langchain/evals/tensor/multi_solana_cancel_nft_listing_eval.ts @@ -0,0 +1,22 @@ +import { runComplexEval, ComplexEvalDataset } from "../utils/runEvals"; + +const DATASET: ComplexEvalDataset[] = [ + { + description: "Multi-turn cancellation of NFT listing on 
Tensor", + inputs: { + query: "I need to cancel my NFT listing", + }, + turns: [ + { input: "I need to cancel my NFT listing" }, + { + input: "Cancel the listing for my NFT with mint 4KG7k12", + expectedToolCall: { + tool: "solana_cancel_nft_listing", + params: { nftMint: "4KG7k12" }, + }, + }, + ], + }, +]; + +runComplexEval(DATASET, "Multi-turn solana_cancel_nft_listing test"); diff --git a/src/langchain/evals/tensor/multi_solana_list_nft_for_sale_eval.ts b/src/langchain/evals/tensor/multi_solana_list_nft_for_sale_eval.ts new file mode 100644 index 000000000..58737e374 --- /dev/null +++ b/src/langchain/evals/tensor/multi_solana_list_nft_for_sale_eval.ts @@ -0,0 +1,26 @@ +import { runComplexEval, ComplexEvalDataset } from "../utils/runEvals"; + +const DATASET: ComplexEvalDataset[] = [ + { + description: "Multi-turn NFT listing for sale on Tensor", + inputs: { + query: "I want to list an NFT for sale", + }, + turns: [ + { input: "I want to list my NFT for sale" }, + { input: "My NFT mint is DDYCpHiiu83DqkG7aaqiUz77rchXx2f4h6BUQAP9Xwcm" }, + { + input: "Please list it for 2.5 SOL", + expectedToolCall: { + tool: "solana_list_nft_for_sale", + params: { + nftMint: "DDYCpHiiu83DqkG7aaqiUz77rchXx2f4h6BUQAP9Xwcm", + price: 2.5, + }, + }, + }, + ], + }, +]; + +runComplexEval(DATASET, "Multi-turn solana_list_nft_for_sale test"); diff --git a/src/langchain/evals/tiplink/multi_solana_tiplink_eval.ts b/src/langchain/evals/tiplink/multi_solana_tiplink_eval.ts new file mode 100644 index 000000000..4555b294e --- /dev/null +++ b/src/langchain/evals/tiplink/multi_solana_tiplink_eval.ts @@ -0,0 +1,19 @@ +import { runComplexEval, ComplexEvalDataset } from "../utils/runEvals"; + +const DATASET: ComplexEvalDataset[] = [ + { + description: "Multi-turn tiplink creation for SOL", + inputs: { + query: "I want to create a tiplink", + }, + turns: [ + { input: "I want to create a tiplink" }, + { + input: "Tip 0.5 SOL", + expectedToolCall: { tool: "solana_tiplink", params: { amount: 0.5 } }, + 
}, + ], + }, +]; + +runComplexEval(DATASET, "Multi-turn solana_tiplink test"); diff --git a/src/langchain/evals/utils/runEvals.ts b/src/langchain/evals/utils/runEvals.ts index 0fbb37154..5c84bed74 100644 --- a/src/langchain/evals/utils/runEvals.ts +++ b/src/langchain/evals/utils/runEvals.ts @@ -50,7 +50,7 @@ function deepCmp(referenceArguments: any, llmArguments: any): boolean { if (!referenceKeys.every((key) => llmKeys.includes(key))) { return false; } - // Only comparse keys from reference (all mandatory) since the llm may return optional parameters that may not be in reference + // Only compare keys from reference (all mandatory) since the llm may return optional parameters that may not be in reference return referenceKeys.every((key) => deepCmp(referenceArguments[key], llmArguments[key]), ); @@ -58,12 +58,11 @@ function deepCmp(referenceArguments: any, llmArguments: any): boolean { function compareArgs( referenceAnswer: { tool: string; response: string }, - llmAnswer: { tool: string; response: string | undefined }, ): boolean { if (!llmAnswer.response || !referenceAnswer.response) return false; - // Repsonses can be just strings (single argument) or KV of parameter names and arguments + // Responses can be just strings (single argument) or KV of parameter names and arguments let parsedReferenceResponse = referenceAnswer.response.startsWith("{") ? 
JSON.parse(referenceAnswer.response) : referenceAnswer.response; @@ -76,12 +75,30 @@ function compareArgs( function compareTools( referenceAnswer: { tool: string; response: string }, - llmAnswer: { tool: string; response: string | undefined }, ): boolean { return llmAnswer.tool === referenceAnswer.tool; } +const toolEvaluator = async (params: { + referenceOutputs: { tool: string; response: string }; + llmAnswer: { tool: string; response: string }; +}) => { + return { + key: "correct_tool", + score: compareTools(params.referenceOutputs, params.llmAnswer), + }; +}; +const argsEvaluator = async (params: { + referenceOutputs: { tool: string; response: string }; + llmAnswer: { tool: string; response: string }; +}) => { + return { + key: "correct_args", + score: compareArgs(params.referenceOutputs, params.llmAnswer), + }; +}; + export async function runEvals( dataset: { inputs: { query: string }; @@ -128,25 +145,6 @@ export async function runEvals( compareArgs(referenceOutputs, llmAnswer) && compareTools(referenceOutputs, llmAnswer); - const toolEvaluator = async (params: { - referenceOutputs: { tool: string; response: string }; - llmAnswer: { tool: string; response: string }; - }) => { - return { - key: "correct_tool", - score: compareArgs(params.referenceOutputs, params.llmAnswer), - }; - }; - const argsEvaluator = async (params: { - referenceOutputs: { tool: string; response: string }; - llmAnswer: { tool: string; response: string }; - }) => { - return { - key: "correct_args", - score: compareArgs(params.referenceOutputs, params.llmAnswer), - }; - }; - const wrappedToolEvaluator = ls.wrapEvaluator(toolEvaluator); await wrappedToolEvaluator({ referenceOutputs, @@ -164,3 +162,107 @@ export async function runEvals( ); }); } + +export type ConversationTurn = { + input: string; + expectedToolCall?: { + tool: string; + params: any; + }; +}; + +export type ComplexEvalDataset = { + description: string; + turns: ConversationTurn[]; + inputs: Record; +}; + +export async 
function runComplexEval( + dataset: ComplexEvalDataset[], + testName: string, +) { + ls.describe(testName, () => { + ls.test.each(dataset as any)(testName, async (scenario) => { + const conversation: Array<{ + role: string; + content: string | null; + tool_calls?: any; + }> = []; + let foundCorrectToolCall = true; + + for (let i = 0; i < scenario.turns.length; i++) { + const turn = scenario.turns[i]; + conversation.push({ role: "user", content: turn.input }); + + const result = await agent.invoke( + { messages: conversation }, + { + configurable: { + thread_id: `${testName}-${new Date().toISOString()}`, // Need unique thread-id to keep context seperate betweet tests + }, + }, + ); + + ls.logOutputs(result); + const assistantMessage = result.messages[result.messages.length - 1]; + conversation.push(assistantMessage); + // conversation.forEach((message) => console.log(message.content)); + if ( + turn.expectedToolCall && + !( + assistantMessage.tool_calls && + assistantMessage.tool_calls.length > 0 + ) + ) { + foundCorrectToolCall = false; + continue; + } + if ( + assistantMessage.tool_calls && + assistantMessage.tool_calls.length > 0 && + turn.expectedToolCall + ) { + const toolCall = assistantMessage.tool_calls[0]; + + const toolName = toolCall?.name || ""; + const llmArgs = toolCall.args.input; + const toolArgs: string = typeof llmArgs === "string" ? llmArgs : "{}"; + const params = turn.expectedToolCall.params; + + if (toolName === turn.expectedToolCall.tool) { + const referenceOutputs = { + tool: turn.expectedToolCall.tool, + response: + typeof params === "string" + ? 
params + : JSON.stringify(turn.expectedToolCall.params), + }; + const llmAnswer: { tool: string; response: string } = { + tool: toolName, + response: toolArgs, + }; + + const argsMatch = compareArgs(referenceOutputs, llmAnswer); + const toolMatches = compareTools(referenceOutputs, llmAnswer); + + foundCorrectToolCall = + foundCorrectToolCall && argsMatch && toolMatches; // && so if it fails on one tool the whole test fails + + const wrappedToolEvaluator = ls.wrapEvaluator(toolEvaluator); + await wrappedToolEvaluator({ + referenceOutputs, + llmAnswer, + }); + + const wrappedArgsEvaluator = ls.wrapEvaluator(argsEvaluator); + await wrappedArgsEvaluator({ + referenceOutputs, + llmAnswer, + }); + } + } + } + expect(foundCorrectToolCall).toBe(true); + }); + }); +} From 94fffa885e451312da699f1af72aa93fb4a66358 Mon Sep 17 00:00:00 2001 From: caizer0x Date: Wed, 26 Mar 2025 13:35:38 -0400 Subject: [PATCH 2/4] fix naming scheme --- ...l.ts => multi_create_gibwork_task.eval.ts} | 0 ..._data_eval.ts => multi_token_data.eval.ts} | 0 ..._swap_eval.ts => multi_token_swap.eval.ts} | 0 ...=> multi_solana_deploy_collection.eval.ts} | 0 .../{basics.evals.ts => multi_basics.eval.ts} | 0 ...lti_solana_openbook_create.market_eval.ts} | 0 ....ts => multi_pumpfun_token_launch.eval.ts} | 0 ...er_eval.ts => multi_balance_other.eval.ts} | 0 ...e_eval.ts => multi_solana_restake.eval.ts} | 0 ...ransfer_eval.ts => multi_transfer.eval.ts} | 0 ...> multi_solana_cancel_nft_listing.eval.ts} | 0 ...=> multi_solana_list_nft_for_sale.eval.ts} | 0 ...k_eval.ts => multi_solana_tiplink.eval.ts} | 0 src/langchain/evals/utils/runEvals.ts | 138 +++++++++--------- 14 files changed, 71 insertions(+), 67 deletions(-) rename src/langchain/evals/gibwork/{multi_create_gibwork_task_eval.ts => multi_create_gibwork_task.eval.ts} (100%) rename src/langchain/evals/jupiter/{multi_token_data_eval.ts => multi_token_data.eval.ts} (100%) rename src/langchain/evals/jupiter/{multi_token_swap_eval.ts => 
multi_token_swap.eval.ts} (100%) rename src/langchain/evals/metaplex/{multi_solana_deploy_collection_eval.ts => multi_solana_deploy_collection.eval.ts} (100%) rename src/langchain/evals/multi/{basics.evals.ts => multi_basics.eval.ts} (100%) rename src/langchain/evals/openbook/{multi_solana_openbook_create_market_eval.ts => multi_solana_openbook_create.market_eval.ts} (100%) rename src/langchain/evals/pumpfun/{multi_pumpfun_token_launch_eval.ts => multi_pumpfun_token_launch.eval.ts} (100%) rename src/langchain/evals/solana/{multi_balance_other_eval.ts => multi_balance_other.eval.ts} (100%) rename src/langchain/evals/solana/{multi_solana_restake_eval.ts => multi_solana_restake.eval.ts} (100%) rename src/langchain/evals/solana/{multi_transfer_eval.ts => multi_transfer.eval.ts} (100%) rename src/langchain/evals/tensor/{multi_solana_cancel_nft_listing_eval.ts => multi_solana_cancel_nft_listing.eval.ts} (100%) rename src/langchain/evals/tensor/{multi_solana_list_nft_for_sale_eval.ts => multi_solana_list_nft_for_sale.eval.ts} (100%) rename src/langchain/evals/tiplink/{multi_solana_tiplink_eval.ts => multi_solana_tiplink.eval.ts} (100%) diff --git a/src/langchain/evals/gibwork/multi_create_gibwork_task_eval.ts b/src/langchain/evals/gibwork/multi_create_gibwork_task.eval.ts similarity index 100% rename from src/langchain/evals/gibwork/multi_create_gibwork_task_eval.ts rename to src/langchain/evals/gibwork/multi_create_gibwork_task.eval.ts diff --git a/src/langchain/evals/jupiter/multi_token_data_eval.ts b/src/langchain/evals/jupiter/multi_token_data.eval.ts similarity index 100% rename from src/langchain/evals/jupiter/multi_token_data_eval.ts rename to src/langchain/evals/jupiter/multi_token_data.eval.ts diff --git a/src/langchain/evals/jupiter/multi_token_swap_eval.ts b/src/langchain/evals/jupiter/multi_token_swap.eval.ts similarity index 100% rename from src/langchain/evals/jupiter/multi_token_swap_eval.ts rename to src/langchain/evals/jupiter/multi_token_swap.eval.ts 
diff --git a/src/langchain/evals/metaplex/multi_solana_deploy_collection_eval.ts b/src/langchain/evals/metaplex/multi_solana_deploy_collection.eval.ts similarity index 100% rename from src/langchain/evals/metaplex/multi_solana_deploy_collection_eval.ts rename to src/langchain/evals/metaplex/multi_solana_deploy_collection.eval.ts diff --git a/src/langchain/evals/multi/basics.evals.ts b/src/langchain/evals/multi/multi_basics.eval.ts similarity index 100% rename from src/langchain/evals/multi/basics.evals.ts rename to src/langchain/evals/multi/multi_basics.eval.ts diff --git a/src/langchain/evals/openbook/multi_solana_openbook_create_market_eval.ts b/src/langchain/evals/openbook/multi_solana_openbook_create.market_eval.ts similarity index 100% rename from src/langchain/evals/openbook/multi_solana_openbook_create_market_eval.ts rename to src/langchain/evals/openbook/multi_solana_openbook_create.market_eval.ts diff --git a/src/langchain/evals/pumpfun/multi_pumpfun_token_launch_eval.ts b/src/langchain/evals/pumpfun/multi_pumpfun_token_launch.eval.ts similarity index 100% rename from src/langchain/evals/pumpfun/multi_pumpfun_token_launch_eval.ts rename to src/langchain/evals/pumpfun/multi_pumpfun_token_launch.eval.ts diff --git a/src/langchain/evals/solana/multi_balance_other_eval.ts b/src/langchain/evals/solana/multi_balance_other.eval.ts similarity index 100% rename from src/langchain/evals/solana/multi_balance_other_eval.ts rename to src/langchain/evals/solana/multi_balance_other.eval.ts diff --git a/src/langchain/evals/solana/multi_solana_restake_eval.ts b/src/langchain/evals/solana/multi_solana_restake.eval.ts similarity index 100% rename from src/langchain/evals/solana/multi_solana_restake_eval.ts rename to src/langchain/evals/solana/multi_solana_restake.eval.ts diff --git a/src/langchain/evals/solana/multi_transfer_eval.ts b/src/langchain/evals/solana/multi_transfer.eval.ts similarity index 100% rename from src/langchain/evals/solana/multi_transfer_eval.ts rename 
to src/langchain/evals/solana/multi_transfer.eval.ts diff --git a/src/langchain/evals/tensor/multi_solana_cancel_nft_listing_eval.ts b/src/langchain/evals/tensor/multi_solana_cancel_nft_listing.eval.ts similarity index 100% rename from src/langchain/evals/tensor/multi_solana_cancel_nft_listing_eval.ts rename to src/langchain/evals/tensor/multi_solana_cancel_nft_listing.eval.ts diff --git a/src/langchain/evals/tensor/multi_solana_list_nft_for_sale_eval.ts b/src/langchain/evals/tensor/multi_solana_list_nft_for_sale.eval.ts similarity index 100% rename from src/langchain/evals/tensor/multi_solana_list_nft_for_sale_eval.ts rename to src/langchain/evals/tensor/multi_solana_list_nft_for_sale.eval.ts diff --git a/src/langchain/evals/tiplink/multi_solana_tiplink_eval.ts b/src/langchain/evals/tiplink/multi_solana_tiplink.eval.ts similarity index 100% rename from src/langchain/evals/tiplink/multi_solana_tiplink_eval.ts rename to src/langchain/evals/tiplink/multi_solana_tiplink.eval.ts diff --git a/src/langchain/evals/utils/runEvals.ts b/src/langchain/evals/utils/runEvals.ts index 5c84bed74..6af09d4b9 100644 --- a/src/langchain/evals/utils/runEvals.ts +++ b/src/langchain/evals/utils/runEvals.ts @@ -189,78 +189,82 @@ export async function runComplexEval( tool_calls?: any; }> = []; let foundCorrectToolCall = true; - - for (let i = 0; i < scenario.turns.length; i++) { - const turn = scenario.turns[i]; - conversation.push({ role: "user", content: turn.input }); - - const result = await agent.invoke( - { messages: conversation }, - { - configurable: { - thread_id: `${testName}-${new Date().toISOString()}`, // Need unique thread-id to keep context seperate betweet tests + try { + for (let i = 0; i < scenario.turns.length; i++) { + const turn = scenario.turns[i]; + conversation.push({ role: "user", content: turn.input }); + + const result = await agent.invoke( + { messages: conversation }, + { + configurable: { + thread_id: `${testName}-${new Date().toISOString()}`, // Need unique 
thread-id to keep context seperate betweet tests + }, }, - }, - ); + ); - ls.logOutputs(result); - const assistantMessage = result.messages[result.messages.length - 1]; - conversation.push(assistantMessage); - // conversation.forEach((message) => console.log(message.content)); - if ( - turn.expectedToolCall && - !( + ls.logOutputs(result); + const assistantMessage = result.messages[result.messages.length - 1]; + conversation.push(assistantMessage); + if ( + turn.expectedToolCall && + !( + assistantMessage.tool_calls && + assistantMessage.tool_calls.length > 0 + ) + ) { + foundCorrectToolCall = false; + continue; + } + if ( assistantMessage.tool_calls && - assistantMessage.tool_calls.length > 0 - ) - ) { - foundCorrectToolCall = false; - continue; - } - if ( - assistantMessage.tool_calls && - assistantMessage.tool_calls.length > 0 && - turn.expectedToolCall - ) { - const toolCall = assistantMessage.tool_calls[0]; - - const toolName = toolCall?.name || ""; - const llmArgs = toolCall.args.input; - const toolArgs: string = typeof llmArgs === "string" ? llmArgs : "{}"; - const params = turn.expectedToolCall.params; - - if (toolName === turn.expectedToolCall.tool) { - const referenceOutputs = { - tool: turn.expectedToolCall.tool, - response: - typeof params === "string" - ? 
params - : JSON.stringify(turn.expectedToolCall.params), - }; - const llmAnswer: { tool: string; response: string } = { - tool: toolName, - response: toolArgs, - }; - - const argsMatch = compareArgs(referenceOutputs, llmAnswer); - const toolMatches = compareTools(referenceOutputs, llmAnswer); - - foundCorrectToolCall = - foundCorrectToolCall && argsMatch && toolMatches; // && so if it fails on one tool the whole test fails - - const wrappedToolEvaluator = ls.wrapEvaluator(toolEvaluator); - await wrappedToolEvaluator({ - referenceOutputs, - llmAnswer, - }); - - const wrappedArgsEvaluator = ls.wrapEvaluator(argsEvaluator); - await wrappedArgsEvaluator({ - referenceOutputs, - llmAnswer, - }); + assistantMessage.tool_calls.length > 0 && + turn.expectedToolCall + ) { + const toolCall = assistantMessage.tool_calls[0]; + + const toolName = toolCall?.name || ""; + const llmArgs = toolCall.args.input; + const toolArgs: string = + typeof llmArgs === "string" ? llmArgs : "{}"; + const params = turn.expectedToolCall.params; + + if (toolName === turn.expectedToolCall.tool) { + const referenceOutputs = { + tool: turn.expectedToolCall.tool, + response: + typeof params === "string" + ? 
params + : JSON.stringify(turn.expectedToolCall.params), + }; + const llmAnswer: { tool: string; response: string } = { + tool: toolName, + response: toolArgs, + }; + + const argsMatch = compareArgs(referenceOutputs, llmAnswer); + const toolMatches = compareTools(referenceOutputs, llmAnswer); + + foundCorrectToolCall = + foundCorrectToolCall && argsMatch && toolMatches; // && so if it fails on one tool the whole test fails + + const wrappedToolEvaluator = ls.wrapEvaluator(toolEvaluator); + await wrappedToolEvaluator({ + referenceOutputs, + llmAnswer, + }); + + const wrappedArgsEvaluator = ls.wrapEvaluator(argsEvaluator); + await wrappedArgsEvaluator({ + referenceOutputs, + llmAnswer, + }); + } } } + } catch (err) { + console.error(err); + conversation.forEach((message) => console.log(message.content)); } expect(foundCorrectToolCall).toBe(true); }); From 55155210f5a0aa9eed8fc19dc2be6260d1a7de21 Mon Sep 17 00:00:00 2001 From: caizer0x Date: Wed, 9 Apr 2025 12:13:53 -0400 Subject: [PATCH 3/4] expectedResponses validation and more evals --- .../gibwork/multi_create_gibwork_task.eval.ts | 18 +- .../evals/jupiter/multi_token_data.eval.ts | 5 +- .../evals/jupiter/multi_token_swap.eval.ts | 10 +- .../multi_solana_deploy_collection.eval.ts | 13 +- .../evals/multi/multi_basics.eval.ts | 154 ++++++++++++++++-- ...ulti_solana_openbook_create.market_eval.ts | 16 +- .../multi_pumpfun_token_launch.eval.ts | 17 +- .../evals/solana/multi_balance_other.eval.ts | 19 ++- .../evals/solana/multi_solana_restake.eval.ts | 29 +++- .../evals/solana/multi_transfer.eval.ts | 101 +++++++++++- .../multi_solana_cancel_nft_listing.eval.ts | 9 +- src/langchain/evals/utils/runEvals.ts | 43 +++++ 12 files changed, 389 insertions(+), 45 deletions(-) diff --git a/src/langchain/evals/gibwork/multi_create_gibwork_task.eval.ts b/src/langchain/evals/gibwork/multi_create_gibwork_task.eval.ts index d5c0d8157..62b6c34b3 100644 --- a/src/langchain/evals/gibwork/multi_create_gibwork_task.eval.ts +++ 
b/src/langchain/evals/gibwork/multi_create_gibwork_task.eval.ts @@ -7,8 +7,15 @@ const DATASET: ComplexEvalDataset[] = [ query: "I need to create a new Gibwork task", }, turns: [ - { input: "I need to create a new Gibwork task" }, - { input: "The task is titled 'Fix my website'" }, + { + input: "I need to create a new Gibwork task", + expectedResponse: "Sure, please provide the task title or details.", + }, + { + input: "The task is titled 'Fix my website'", + expectedResponse: + "Understood, 'Fix my website' is the task title. Any more details?", + }, { input: "Also, what's the current price of JUP?", expectedToolCall: { @@ -16,7 +23,12 @@ const DATASET: ComplexEvalDataset[] = [ params: "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN", }, }, - { input: "It should be for 1000 JUP tokens with no extra content" }, + { + input: + "The Gibwork job should be for 1000 JUP tokens with no extra content.", + expectedResponse: + "Okay, 1000 JUP tokens, no additional content. Any requirements or tags?", + }, { input: "Set content and requirements to N/A and tag it as webdev", expectedToolCall: { diff --git a/src/langchain/evals/jupiter/multi_token_data.eval.ts b/src/langchain/evals/jupiter/multi_token_data.eval.ts index 387517760..7994548a3 100644 --- a/src/langchain/evals/jupiter/multi_token_data.eval.ts +++ b/src/langchain/evals/jupiter/multi_token_data.eval.ts @@ -7,7 +7,10 @@ const DATASET: ComplexEvalDataset[] = [ query: "What's the price of KING?", }, turns: [ - { input: "What's the price of KING?" 
}, + { + input: "What's the price of KING?", + expectedResponse: "Sure, can you provide the mint address of KING?", + }, { input: "The mint address is 5eqNDjbsWL9hfAqUfhegTxgEa3XardzGdVAboMA4pump", diff --git a/src/langchain/evals/jupiter/multi_token_swap.eval.ts b/src/langchain/evals/jupiter/multi_token_swap.eval.ts index 8d9e9eaaa..0678f74b1 100644 --- a/src/langchain/evals/jupiter/multi_token_swap.eval.ts +++ b/src/langchain/evals/jupiter/multi_token_swap.eval.ts @@ -7,8 +7,14 @@ const DATASET: ComplexEvalDataset[] = [ query: "I want to swap some tokens", }, turns: [ - { input: "I want to swap some tokens" }, - { input: "I want to exchange USDC for JUP tokens" }, + { + input: "I want to swap some tokens", + expectedResponse: "Sure, which tokens would you like to swap?", + }, + { + input: "I want to exchange USDC for JUP tokens", + expectedResponse: "How much USDC?", + }, { input: "Swap 10 USDC for JUP with 1% slippage", expectedToolCall: { diff --git a/src/langchain/evals/metaplex/multi_solana_deploy_collection.eval.ts b/src/langchain/evals/metaplex/multi_solana_deploy_collection.eval.ts index 2bd993753..29d437925 100644 --- a/src/langchain/evals/metaplex/multi_solana_deploy_collection.eval.ts +++ b/src/langchain/evals/metaplex/multi_solana_deploy_collection.eval.ts @@ -7,8 +7,15 @@ const DATASET: ComplexEvalDataset[] = [ query: "I want to deploy an NFT collection", }, turns: [ - { input: "I want to deploy an NFT collection" }, - { input: "The collection should be named MyCollection" }, + { + input: "I want to deploy an NFT collection", + expectedResponse: + "Sure, what's the name of your collection? I also need the metadata URI and royalty basis points.", + }, + { + input: "The collection should be named MyCollection", + expectedResponse: "Got it. Metadata URI and royalty basis points?", + }, { input: "Its metadata URI is https://metadata.mycoll.io/collection.json. 
Set the royalty to 250 basis points", @@ -24,7 +31,7 @@ const DATASET: ComplexEvalDataset[] = [ { input: "Also, retrieve the deployed collection details", expectedToolCall: { - tool: "get_asset", + tool: "solana_get_asset", params: { collection: "MyCollection" }, }, }, diff --git a/src/langchain/evals/multi/multi_basics.eval.ts b/src/langchain/evals/multi/multi_basics.eval.ts index 362a12107..96c7dabb6 100644 --- a/src/langchain/evals/multi/multi_basics.eval.ts +++ b/src/langchain/evals/multi/multi_basics.eval.ts @@ -132,6 +132,56 @@ const DATASET: ComplexEvalDataset[] = [ }, ], }, + { + description: "Multi-turn flow: Check balance when asked to mint NFT.", + inputs: { + query: "Mint NFT if there is balance.", + }, + turns: [ + { + input: + "Mint an NFT with name 'MyFirstNFT' and symbol 'MFN', uri: https://example.com/nft.json.", + expectedToolCall: { + // Should check balance before trying to mint + tool: "solana_balance", + params: {}, + }, + }, + { + input: "Try minting the NFTs anyways", + expectedToolCall: { + // since the user said so, should try minting with zero balance and get an error + tool: "solana_mint_nft", + params: { + name: "MyFirstNFT", + symbol: "MFN", + uri: "https://example.com/nft.json", + }, + }, + }, + ], + }, + { + description: "Multi-turn flow: Mint NFT without balance", + inputs: { + query: "Mint NFT if there is balance.", + }, + turns: [ + { + input: + "Mint an NFT with name 'MyFirstNFT' and symbol 'MFN', uri: https://example.com/nft.json.", + expectedToolCall: { + // will produce an error since the balance should be zero + tool: "solana_mint_nft", + params: { + name: "MyFirstNFT", + symbol: "MFN", + uri: "https://example.com/nft.json", + }, + }, + }, + ], + }, { description: "Multi-turn flow: Create multisig, deposit SOL", inputs: { @@ -186,16 +236,16 @@ const DATASET: ComplexEvalDataset[] = [ input: "Now check my USDC balance", expectedToolCall: { tool: "solana_balance", - params: JSON.stringify({ + params: { tokenAddres: 
"EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", - }), + }, }, }, { input: "Stake 0.5 SOL", expectedToolCall: { tool: "solana_stake", - params: JSON.stringify({ amount: 0.5 }), + params: { amount: 0.5 }, }, }, ], @@ -258,9 +308,9 @@ const DATASET: ComplexEvalDataset[] = [ input: "How much of that token do I hold?", expectedToolCall: { tool: "solana_balance", - params: JSON.stringify({ + params: { tokenAddress: "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN", - }), + }, }, }, { @@ -288,9 +338,9 @@ const DATASET: ComplexEvalDataset[] = [ "How many tokens do I have at mint EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v?", expectedToolCall: { tool: "solana_balance", - params: JSON.stringify({ + params: { tokenAddress: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", - }), + }, }, }, { @@ -321,9 +371,9 @@ const DATASET: ComplexEvalDataset[] = [ input: "Check how many tokens I have left", expectedToolCall: { tool: "solana_balance", - params: JSON.stringify({ + params: { tokenAddress: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", - }), + }, }, }, ], @@ -352,7 +402,7 @@ const DATASET: ComplexEvalDataset[] = [ input: "Now stake 1 SOL", expectedToolCall: { tool: "solana_stake", - params: JSON.stringify({ amount: 1 }), + params: { amount: 1 }, }, }, { @@ -397,4 +447,86 @@ const DATASET: ComplexEvalDataset[] = [ }, ]; -runComplexEval(DATASET, "Multi-turn Basic User Flows test"); +async function clusterAwarenessEval() { + const devnet = { + description: "Multi-turn flow: if on devnet request funds from faucet", + inputs: { + query: "Check if on devnet", + }, + turns: [ + { + input: "Are you connected to the Solana mainnet or devnet?", + expectedResponse: "I am connected to the devnet.", + }, + { + input: "Request 2 SOL from the faucet", + expectedToolCall: { + tool: "solana_request_funds", + params: "{}", + }, + }, + { + input: "Check how much SOL I have now", + expectedToolCall: { + tool: "solana_balance", + params: "{}", + }, + }, + ], + }; + const mainnet = { + description: + 
"Multi-turn flow: if on mainnet requesting faucet funds should fail", + inputs: { + query: "Check if on mainnet", + }, + turns: [ + { + input: "Are you connected to the Solana mainnet or devnet?", + expectedResponse: "I am connected to the mainnet.", + }, + { + input: "Request 2 SOL from the faucet", + expectedResponse: "I cannot request funds from the faucet on mainnet.", + }, + ], + }; + + const rpc = process.env.RPC_URL || "https://api.devnet.solana.com"; + try { + const response = await fetch(rpc, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + jsonrpc: "2.0", + id: 1, + method: "getGenesisHash", + }), + }); + + const data = await response.json(); + const genesisHash = data.result; + + const mainnetHash = "5eykt4UsFv8P8NJdTREpY1vzqKqZKvdpKuc147dw2N9d"; + const devnetHash = "EtWTRABZaYq6iMfeYKouRu166VU2xqa1wcaWoxPkrZBG"; + + if (genesisHash === mainnetHash) { + return mainnet; + } else { + return devnet; + } + } catch (error) { + console.error("Error checking cluster:", error); + return devnet; + } +} + +async function runEvaluations() { + const clusterEval = await clusterAwarenessEval(); + const updatedDataset = [...DATASET, clusterEval]; + runComplexEval(updatedDataset, "Multi-turn Basic User Flows Evals"); +} + +runEvaluations(); diff --git a/src/langchain/evals/openbook/multi_solana_openbook_create.market_eval.ts b/src/langchain/evals/openbook/multi_solana_openbook_create.market_eval.ts index fb4770103..ba4cdd529 100644 --- a/src/langchain/evals/openbook/multi_solana_openbook_create.market_eval.ts +++ b/src/langchain/evals/openbook/multi_solana_openbook_create.market_eval.ts @@ -7,9 +7,19 @@ const DATASET: ComplexEvalDataset[] = [ query: "I need to create a new openbook market", }, turns: [ - { input: "I need to create a new openbook market" }, - { input: "Let’s use SOL as the base mint" }, - { input: "And USDC as the quote mint" }, + { + input: "I need to create a new openbook market", + 
expectedResponse: "Sure, what would be the base mint?", + }, + { + input: "Let’s use SOL as the base mint", + expectedResponse: + "Got it, SOL is the base mint. What about the quote mint?", + }, + { + input: "And USDC as the quote mint", + expectedResponse: "USDC as the quote mint. Any lot size or tick size?", + }, { input: "Set the lot size to 100 and tick size to 1.5", expectedToolCall: { diff --git a/src/langchain/evals/pumpfun/multi_pumpfun_token_launch.eval.ts b/src/langchain/evals/pumpfun/multi_pumpfun_token_launch.eval.ts index f4d5129e4..99ce43a76 100644 --- a/src/langchain/evals/pumpfun/multi_pumpfun_token_launch.eval.ts +++ b/src/langchain/evals/pumpfun/multi_pumpfun_token_launch.eval.ts @@ -7,9 +7,20 @@ const DATASET: ComplexEvalDataset[] = [ query: "I want to launch a new PumpFun token", }, turns: [ - { input: "I want to launch a new PumpFun token" }, - { input: "I want it to be called YOLO" }, - { input: "The ticker should be YOLO and description 'yolo token'" }, + { + input: "I want to launch a new PumpFun token", + expectedResponse: + "Sure, I will need a token name, ticker, image and description.", + }, + { + input: "I want it to be called YOLO", + expectedResponse: + "Okay, YOLO is the name. What about the ticker, description and image?", + }, + { + input: "The ticker should be YOLO and description 'yolo token'", + expectedResponse: "Great. 
Do you have an image URL?", + }, { input: "Use the image URL https://example.com/yolo.png", expectedToolCall: { diff --git a/src/langchain/evals/solana/multi_balance_other.eval.ts b/src/langchain/evals/solana/multi_balance_other.eval.ts index 506105363..ddaa7705c 100644 --- a/src/langchain/evals/solana/multi_balance_other.eval.ts +++ b/src/langchain/evals/solana/multi_balance_other.eval.ts @@ -7,7 +7,10 @@ const DATASET: ComplexEvalDataset[] = [ query: "I want to check my friend's balance", }, turns: [ - { input: "Check my friend's USDC balance" }, + { + input: "Check my friend's USDC balance", + expectedResponse: "Sure, what's their wallet address?", + }, { input: "The wallet address is GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", @@ -21,7 +24,19 @@ const DATASET: ComplexEvalDataset[] = [ }, { input: "Also, check my SOL balance", - expectedToolCall: { tool: "solana_balance", params: {} }, + expectedToolCall: { + tool: "solana_balance", + params: {}, + }, + }, + { + input: "Whats my USDC balance?", + expectedToolCall: { + tool: "solana_balance", + params: { + tokenAddress: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", + }, + }, }, ], }, diff --git a/src/langchain/evals/solana/multi_solana_restake.eval.ts b/src/langchain/evals/solana/multi_solana_restake.eval.ts index f00c858be..7a9171175 100644 --- a/src/langchain/evals/solana/multi_solana_restake.eval.ts +++ b/src/langchain/evals/solana/multi_solana_restake.eval.ts @@ -4,14 +4,29 @@ const DATASET: ComplexEvalDataset[] = [ { description: "Multi-turn restake SOL", inputs: { - query: "I want to restake my SOL" + query: "I want to restake my SOL", }, turns: [ - { input: "I want to restake my SOL" }, - { input: "Please restake 1.5 SOL for me", expectedToolCall: { tool: "solana_restake", params: { amount: 1.5 } } }, - { input: "Then check my updated SOL balance", expectedToolCall: { tool: "solana_balance", params: {} } } - ] - } + { + input: "I want to restake my SOL", + expectedResponse: "Sure, how much SOL would 
you like to restake?", + }, + { + input: "Please restake 1.5 SOL for me", + expectedToolCall: { + tool: "solana_restake", + params: { amount: 1.5 }, + }, + }, + { + input: "Then check my updated SOL balance", + expectedToolCall: { + tool: "solana_balance", + params: {}, + }, + }, + ], + }, ]; -runComplexEval(DATASET, "Multi-turn Restake test"); \ No newline at end of file +runComplexEval(DATASET, "Multi-turn Restake test"); diff --git a/src/langchain/evals/solana/multi_transfer.eval.ts b/src/langchain/evals/solana/multi_transfer.eval.ts index 5e78677a9..0503a51b8 100644 --- a/src/langchain/evals/solana/multi_transfer.eval.ts +++ b/src/langchain/evals/solana/multi_transfer.eval.ts @@ -4,14 +4,101 @@ const DATASET: ComplexEvalDataset[] = [ { description: "Multi-turn SOL transfer", inputs: { - query: "I want to send some SOL" + query: "I want to send some SOL", }, turns: [ - { input: "I want to send some SOL" }, - { input: "Please transfer 0.05 SOL" }, - { input: "Send it to wallet GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", expectedToolCall: { tool: "solana_transfer", params: { to: "GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", amount: 0.05 } } } - ] - } + { + input: "I want to send some SOL", + expectedResponse: + "Sure, how much SOL and what is the recipient address?", + }, + { + input: "Please transfer 0.05 SOL", + expectedResponse: "Alright, to which address?", + }, + { + input: "Send it to wallet GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", + expectedToolCall: { + tool: "solana_transfer", + params: { + to: "GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", + amount: 0.05, + }, + }, + }, + ], + }, + { + description: "Multi-turn SOL transfer", + inputs: { + query: "Send large SOL amount", + }, + turns: [ + { + input: "I want to send 1000 SOL", + expectedResponse: "Sure, what is the recipient address?", + }, + { + input: "Send it to wallet GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", + expectedToolCall: { + tool: "solana_transfer", + params: { + to: 
"GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", + amount: 1000, + }, + }, + }, + ], + }, + { + description: "Multi-turn SOL transfer", + inputs: { + query: "Send more SOL than in balance", + }, + turns: [ + { + input: "Check my balance of SOL", + expectedToolCall: { + tool: "solana_balance", + params: {}, + }, + }, + { + input: + "Send twice my balance to GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", + expectedToolCall: { + tool: "", // should be no tool call since you can't transfer 2x balance + params: {}, + }, + }, + ], + }, + { + description: "Multi-turn SOL transfer", + inputs: { + query: "Send more SOL than in balance", + }, + turns: [ + { + input: + "Check my friends SOL balance, his address is: zZNEUiAq2kLgWFJiZofHfcar91ph7yE2nUfLmXswkvP", + expectedToolCall: { + tool: "solana_balance_other", + params: { + walletAddress: "zZNEUiAq2kLgWFJiZofHfcar91ph7yE2nUfLmXswkvP", + }, + }, + }, + { + input: + "Transfer from him to my address: GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", + expectedToolCall: { + tool: "", // should be no tool call since the user is asking to transfer from non owned account. 
+ params: {}, + }, + }, + ], + }, ]; -runComplexEval(DATASET, "Multi-turn Transfer test"); \ No newline at end of file +runComplexEval(DATASET, "Multi-turn Transfer test"); diff --git a/src/langchain/evals/tensor/multi_solana_cancel_nft_listing.eval.ts b/src/langchain/evals/tensor/multi_solana_cancel_nft_listing.eval.ts index 99a719b0d..35a76c2a2 100644 --- a/src/langchain/evals/tensor/multi_solana_cancel_nft_listing.eval.ts +++ b/src/langchain/evals/tensor/multi_solana_cancel_nft_listing.eval.ts @@ -7,12 +7,15 @@ const DATASET: ComplexEvalDataset[] = [ query: "I need to cancel my NFT listing", }, turns: [ - { input: "I need to cancel my NFT listing" }, { - input: "Cancel the listing for my NFT with mint 4KG7k12", + input: "I need to cancel my NFT listing", + expectedResponse: "Please provide the mint address of your NFT.", + }, + { + input: "zZNEUiAq2kLgWFJiZofHfcar91ph7yE2nUfLmXswkvP", expectedToolCall: { tool: "solana_cancel_nft_listing", - params: { nftMint: "4KG7k12" }, + params: { nftMint: "zZNEUiAq2kLgWFJiZofHfcar91ph7yE2nUfLmXswkvP" }, }, }, ], diff --git a/src/langchain/evals/utils/runEvals.ts b/src/langchain/evals/utils/runEvals.ts index 6af09d4b9..c54ed4ec7 100644 --- a/src/langchain/evals/utils/runEvals.ts +++ b/src/langchain/evals/utils/runEvals.ts @@ -99,6 +99,38 @@ const argsEvaluator = async (params: { }; }; +// import dotenv from "dotenv"; +// dotenv.config(); + +// Compare actual response with the expectedResponse using a LLM +async function responseEvaluator( + expectedResponse: string, + actualResponse: string, +) { + const systemPrompt = `Compare the two strings the user gives you. Are they completely different? If completely different, return "false", else return "true". Return only those words. + Example: + Expected: Sure, which base mint? + Actual: Please give me the mint address for the base token. 
+ + Response: "true" + `; + + const userPrompt = `Expected: ${expectedResponse} +Actual: ${actualResponse}`; + + const result = await llm.invoke([ + { role: "system", content: systemPrompt }, + { role: "user", content: userPrompt }, + ]); + console.log({ userPrompt, result: result.content }); + const content = result.content as string; + + return content.toLowerCase().includes("true"); +} + +/** + * Runs single-turn eval for basic function calls + */ export async function runEvals( dataset: { inputs: { query: string }; @@ -165,6 +197,7 @@ export async function runEvals( export type ConversationTurn = { input: string; + expectedResponse?: string; expectedToolCall?: { tool: string; params: any; @@ -206,6 +239,16 @@ export async function runComplexEval( ls.logOutputs(result); const assistantMessage = result.messages[result.messages.length - 1]; conversation.push(assistantMessage); + + if (turn.expectedResponse) { + foundCorrectToolCall = + foundCorrectToolCall && + (await responseEvaluator( + turn.expectedResponse, + assistantMessage.content, + )); + } + if ( turn.expectedToolCall && !( From efc09cc64de0b5ab5dc1c92443116591e67f2e85 Mon Sep 17 00:00:00 2001 From: caizer0x Date: Wed, 9 Apr 2025 14:11:42 -0400 Subject: [PATCH 4/4] remove console log --- src/langchain/evals/utils/runEvals.ts | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/langchain/evals/utils/runEvals.ts b/src/langchain/evals/utils/runEvals.ts index c54ed4ec7..c7cc114ee 100644 --- a/src/langchain/evals/utils/runEvals.ts +++ b/src/langchain/evals/utils/runEvals.ts @@ -99,9 +99,6 @@ const argsEvaluator = async (params: { }; }; -// import dotenv from "dotenv"; -// dotenv.config(); - // Compare actual response with the expectedResponse using a LLM async function responseEvaluator( expectedResponse: string, @@ -122,7 +119,6 @@ Actual: ${actualResponse}`; { role: "system", content: systemPrompt }, { role: "user", content: userPrompt }, ]); - console.log({ userPrompt, result: result.content }); const 
content = result.content as string; return content.toLowerCase().includes("true");