1
0
Fork 0
mirror of https://github.com/maybe-finance/maybe.git synced 2025-08-09 23:45:21 +02:00

Batch AI auto-categorization of transactions

This commit is contained in:
Zach Gollwitzer 2025-04-15 20:09:07 -04:00
parent bc7b96863f
commit a1cb17c0da
16 changed files with 550 additions and 22 deletions

View file

@ -23,6 +23,7 @@ export default class extends Controller {
remove(e) {
if (e.params.destroy) {
this.destroyFieldTarget.value = true;
this.element.classList.add("hidden");
} else {
this.element.remove();
}
@ -77,6 +78,9 @@ export default class extends Controller {
const textInput = this.#convertFormFieldTo("input", this.valueInputEl);
textInput.placeholder = "Enter a value";
textInput.type = conditionFilter.type; // "text" || "number"
if (conditionFilter.type === "number") {
textInput.step = conditionFilter.number_step;
}
this.valueInputEl.replaceWith(textInput);
}

View file

@ -34,6 +34,10 @@ class Entry < ApplicationRecord
)
}
# Classifies the entry by the sign of its amount.
#
# @return [String] "income" when the amount is negative, otherwise "expense"
#   (zero counts as "expense")
def classification
  return "income" if amount.negative?

  "expense"
end
def lock_saved_attributes!
super
entryable.lock_saved_attributes!

View file

@ -1,6 +1,12 @@
module Provider::LlmConcept
extend ActiveSupport::Concern
AutoCategorization = Data.define(:transaction_id, :category_name)
# Placeholder for LLM-backed auto-categorization; always raises so that
# including providers are forced to supply their own implementation.
#
# NOTE(review): Provider::Openai implements this with keyword arguments
# (transactions:, user_categories:) rather than a single positional
# argument -- confirm which signature is the intended contract.
#
# @param transactions the transactions to categorize
# @raise [NotImplementedError] always
def auto_categorize(transactions)
  message = "Subclasses must implement #auto_categorize"
  raise NotImplementedError, message
end
ChatMessage = Data.define(:id, :output_text)
ChatStreamChunk = Data.define(:type, :data)
ChatResponse = Data.define(:id, :model, :messages, :function_requests)

View file

@ -4,7 +4,7 @@ class Provider::Openai < Provider
# Subclass so errors caught in this provider are raised as Provider::Openai::Error
Error = Class.new(Provider::Error)
MODELS = %w[gpt-4o]
MODELS = %w[gpt-4o gpt-4o-mini]
def initialize(access_token)
@client = ::OpenAI::Client.new(access_token: access_token)
@ -14,6 +14,18 @@ class Provider::Openai < Provider
MODELS.include?(model)
end
# Auto-categorizes up to 100 transactions in a single request by delegating to
# AutoCategorizer, inside with_provider_response.
#
# @param transactions [Array] transactions to categorize (at most 100)
# @param user_categories [Array] the categories the transactions may be matched to
# @raise [Error] (inside the with_provider_response block) when more than
#   100 transactions are given
def auto_categorize(transactions: [], user_categories: [])
  with_provider_response do
    if transactions.size > 100
      raise Error, "Too many transactions to auto-categorize. Max is 100 per request"
    end

    categorizer = AutoCategorizer.new(
      client,
      transactions: transactions,
      user_categories: user_categories
    )

    categorizer.auto_categorize
  end
end
def chat_response(prompt, model:, instructions: nil, functions: [], function_results: [], streamer: nil, previous_response_id: nil)
with_provider_response do
chat_config = ChatConfig.new(

View file

@ -0,0 +1,118 @@
# Sends a batch of transactions together with the user's categories to the
# OpenAI Responses API (structured JSON output) and converts the reply into
# AutoCategorization value objects.
class Provider::Openai::AutoCategorizer
  AutoCategorization = Provider::LlmConcept::AutoCategorization

  # Model used for every auto-categorization request.
  MODEL = "gpt-4o-mini"

  # Sentinel category name the model returns when it has no confident match.
  NO_MATCH = "null"

  # @param client the OpenAI client; must respond to `responses.create`
  # @param transactions [Array<Hash>] transaction attributes; each needs an :id
  # @param user_categories [Array<Hash>] category attributes; each needs a :name
  def initialize(client, transactions: [], user_categories: [])
    @client = client
    @transactions = transactions
    @user_categories = user_categories
  end

  # Requests categorizations for all transactions in one API call.
  #
  # @return [Array<AutoCategorization>] one entry per categorization returned by
  #   the model; category_name is nil when the model answered "null"
  # @raise [Provider::Openai::Error] when the response carries no output text
  def auto_categorize
    # Nothing to ask the model about; this also avoids sending a schema whose
    # transaction_id enum would be empty.
    return [] if transactions.empty?

    response = client.responses.create(parameters: {
      model: MODEL,
      input: [ { role: "developer", content: developer_message } ],
      text: {
        format: {
          type: "json_schema",
          name: "auto_categorize_personal_finance_transactions",
          strict: true,
          schema: json_schema
        }
      },
      instructions: instructions
    })

    build_response(extract_categorizations(response))
  end

  private
    attr_reader :client, :transactions, :user_categories

    # Wraps the raw categorization hashes in AutoCategorization value objects.
    def build_response(categorizations)
      categorizations.map do |categorization|
        AutoCategorization.new(
          transaction_id: categorization["transaction_id"],
          category_name: normalize_category_name(categorization["category_name"])
        )
      end
    end

    # Maps the "null" sentinel to a real nil; other names pass through unchanged.
    def normalize_category_name(category_name)
      category_name unless category_name == NO_MATCH
    end

    # Pulls the JSON payload out of the first "message" output item's first
    # "output_text" part and returns its "categorizations" array.
    def extract_categorizations(response)
      message = Array(response["output"]).find { |item| item["type"] == "message" }
      text_part = Array(message&.dig("content")).find { |part| part["type"] == "output_text" }
      output_text = text_part&.dig("text")

      if output_text.nil? || output_text.empty?
        raise Provider::Openai::Error, "OpenAI auto-categorization response did not contain any output text"
      end

      JSON.parse(output_text).fetch("categorizations", [])
    end

    # Strict JSON schema for the model output. Both fields are constrained to
    # enums so the model can only echo known transaction ids and category names
    # (or the "null" sentinel).
    def json_schema
      {
        type: "object",
        properties: {
          categorizations: {
            type: "array",
            description: "An array of auto-categorizations for each transaction",
            items: {
              type: "object",
              properties: {
                transaction_id: {
                  type: "string",
                  description: "The internal ID of the original transaction",
                  enum: transactions.map { |t| t[:id] }
                },
                category_name: {
                  type: "string",
                  description: "The matched category name of the transaction, or null if no match",
                  # uniq guards against repeated names (including a user category literally named "null")
                  enum: [ *user_categories.map { |c| c[:name] }, NO_MATCH ].uniq
                }
              },
              required: [ "transaction_id", "category_name" ],
              additionalProperties: false
            }
          }
        },
        required: [ "categorizations" ],
        additionalProperties: false
      }
    end

    # Developer-role message carrying the categories and transactions as JSON.
    def developer_message
      <<~MESSAGE
        Here are the user's available categories in JSON format:
        ```json
        #{user_categories.to_json}
        ```
        Use the available categories to auto-categorize the following transactions:
        ```json
        #{transactions.to_json}
        ```
      MESSAGE
    end

    # System-level instructions describing the matching rules to the model.
    def instructions
      <<~INSTRUCTIONS
        You are an assistant to a consumer personal finance app. You will be provided a list
        of the user's transactions and a list of the user's categories. Your job is to auto-categorize
        each transaction.
        Closely follow ALL the rules below while auto-categorizing:
        - Return 1 result per transaction
        - Correlate each transaction by ID (transaction_id)
        - Attempt to match the most specific category possible (i.e. subcategory over parent category)
        - Category and transaction classifications should match (i.e. if transaction is an "expense", the category must have classification of "expense")
        - If you don't know the category, return "null"
        - You should always favor "null" over false positives
        - Be slightly pessimistic. Only match a category if you're 60%+ confident it is the correct one.
        - Each transaction has varying metadata that can be used to determine the category
        - Note: "hint" comes from 3rd party aggregators and typically represents a category name that
        may or may not match any of the user-supplied categories
      INSTRUCTIONS
    end
end

View file

@ -1,4 +1,6 @@
class Rule < ApplicationRecord
include Provided
UnsupportedResourceTypeError = Class.new(StandardError)
belongs_to :family
@ -11,8 +13,8 @@ class Rule < ApplicationRecord
validates :resource_type, presence: true
validate :no_nested_compound_conditions
# Every rule must have at least 1 condition + action
validate :min_conditions_and_actions
# Every rule must have at least 1 action
validate :min_actions
validate :no_duplicate_actions
def action_executors
@ -67,11 +69,7 @@ class Rule < ApplicationRecord
scope
end
def min_conditions_and_actions
if conditions.reject(&:marked_for_destruction?).empty?
errors.add(:base, "must have at least one condition")
end
def min_actions
if actions.reject(&:marked_for_destruction?).empty?
errors.add(:base, "must have at least one action")
end

View file

@ -1,5 +1,104 @@
# Rule action that asks the rule's LLM provider to pick a category for every
# enrichable transaction in scope that does not have one yet.
class Rule::ActionExecutor::AiAutoCategorize < Rule::ActionExecutor
  ProviderMissingError = Class.new(StandardError)

  # Number of transactions sent to the provider per request.
  BATCH_SIZE = 100

  # Stop processing once this many batches in a row have failed.
  MAX_CONSECUTIVE_FAILURES = 3

  # Auto-categorizes the matching transactions batch by batch.
  #
  # @param transaction_scope scope of transactions the rule applies to
  # @param value not used by this action
  # @param ignore_attribute_locks not used by this action
  # @raise [ProviderMissingError] when the rule has no LLM provider
  def execute(transaction_scope, value: nil, ignore_attribute_locks: false)
    raise ProviderMissingError, "LLM provider is not configured" unless llm_provider.present?

    enrichable_transactions = transaction_scope
      .enrichable(:category_id)
      .where(category_id: nil)
      .includes(:category, :merchant, :entry)

    # Count once and reuse it for the emptiness check, the log line and the batch math.
    total_transactions = enrichable_transactions.count

    if total_transactions.zero?
      Rails.logger.info("No transactions to auto-categorize for rule #{rule.id}")
      return
    end

    Rails.logger.info("Auto-categorizing #{total_transactions} transactions for rule #{rule.id}")

    total_batches = (total_transactions.to_f / BATCH_SIZE).ceil
    batch_index = 0
    consecutive_failures = 0

    enrichable_transactions.in_batches(of: BATCH_SIZE, load: true) do |batch|
      batch_index += 1
      percent_complete = ((batch_index.to_f / total_batches) * 100).round
      Rails.logger.info("Processing batch #{batch_index} of #{total_batches} (#{percent_complete}% complete) for rule #{rule.id}")

      if process_batch(batch)
        consecutive_failures = 0
      else
        consecutive_failures += 1
        break if consecutive_failures >= MAX_CONSECUTIVE_FAILURES
      end
    end
  end

  private
    def llm_provider
      rule.llm_provider
    end

    # Sends one batch to the provider and applies the returned categories.
    #
    # @return [Boolean] false when the provider call failed, true otherwise
    def process_batch(batch)
      result = llm_provider.auto_categorize(
        transactions: prepare_transaction_input(batch),
        user_categories: user_categories
      )

      unless result.success?
        Rails.logger.error("Failed to auto-categorize transactions for rule #{rule.id}: #{result.error.message}")
        return false
      end

      # First categorization per transaction id wins.
      categorizations_by_txn_id = result.data.each_with_object({}) do |categorization, lookup|
        lookup[categorization.transaction_id] ||= categorization
      end

      batch.each do |txn|
        # Locked for every transaction in the batch, whether or not a category is found.
        txn.lock!(:category_id)

        category_name = categorizations_by_txn_id[txn.id]&.category_name
        category_id = category_ids_by_name[category_name]
        next unless category_id.present?

        DataEnrichment.transaction do
          # NOTE(review): `value` is part of the lookup, so the assignment below is
          # redundant as written; confirm whether the lookup was meant to exclude it.
          de = DataEnrichment.find_or_create_by!(
            enrichable: txn,
            attribute_name: "category_id",
            value: category_id,
            source: "rule"
          )

          de.value = category_id
          de.save!

          txn.update!(category_id: category_id)
        end
      end

      true
    end

    # Attributes of each transaction that are sent to the provider.
    def prepare_transaction_input(transactions)
      transactions.map do |transaction|
        {
          id: transaction.id,
          amount: transaction.entry.amount.abs,
          classification: transaction.entry.classification,
          description: transaction.entry.name,
          merchant: transaction.merchant&.name
        }
      end
    end

    # The family's categories as plain hashes; built once per executor instance.
    def user_categories
      @user_categories ||= rule.family.categories.map do |category|
        {
          id: category.id,
          name: category.name,
          is_subcategory: category.subcategory?,
          parent_id: category.parent_id,
          classification: category.classification
        }
      end
    end

    # Category id keyed by category name; the first category with a given name wins.
    def category_ids_by_name
      @category_ids_by_name ||= user_categories.each_with_object({}) do |category, lookup|
        lookup[category[:name]] = category[:id] unless lookup.key?(category[:name])
      end
    end
end

View file

@ -17,17 +17,19 @@ class Rule::ActionExecutor::SetTransactionCategory < Rule::ActionExecutor
end
scope.each do |txn|
txn.update!(category: category)
DataEnrichment.transaction do
txn.update!(category: category)
de = DataEnrichment.find_or_create_by!(
enrichable: txn,
attribute_name: "category_id",
value: category.id,
source: "rule"
)
de = DataEnrichment.find_or_create_by!(
enrichable: txn,
attribute_name: "category_id",
value: category.id,
source: "rule"
)
de.value = category.id
de.save!
de.value = category.id
de.save!
end
end
end
end

View file

@ -17,6 +17,11 @@ class Rule::ConditionFilter
"text"
end
# Step value for numeric inputs, taken from the family's currency.
#
# @return the `step` reported by Money::Currency for the family's currency
def number_step
  Money::Currency.new(family.currency).step
end
# Identifier for this filter, derived from the class name: the namespace is
# dropped (demodulize) and the remainder is underscored.
def key
  class_name = self.class.name
  class_name.demodulize.underscore
end
@ -49,7 +54,8 @@ class Rule::ConditionFilter
key: key,
label: label,
operators: operators,
options: options
options: options,
number_step: number_step
}
end

View file

@ -0,0 +1,21 @@
# Gives Rule class-level and instance-level access to the external providers it
# relies on, looked up through Provider::Registry.
module Rule::Provided
  extend ActiveSupport::Concern

  class_methods do
    # Provider registered under :openai.
    def llm_provider = Provider::Registry.get_provider(:openai)

    # Provider registered under :synth.
    def synth = Provider::Registry.get_provider(:synth)
  end

  # Instance-level shortcuts that defer to the class-level lookups.
  def llm_provider = self.class.llm_provider

  def synth = self.class.synth
end

View file

@ -1,6 +1,6 @@
class Rule::Registry::TransactionResource < Rule::Registry
# Transactions this rule may act on: the family's active transactions, joined
# with their entry, dated on or after the rule's effective date.
def resource_scope
  family.transactions.active.with_entry.where(entry: { date: rule.effective_date.. })
end
def condition_filters

View file

@ -15,7 +15,7 @@
<%= tag.div class: class_names("min-w-1/2 flex items-center gap-2", "hidden" => !needs_value),
data: { rule__actions_target: "actionValue" } do %>
<span class="font-medium uppercase text-xs">to</span>
<%= form.select :value, action.options, {}, disabled: !needs_value %>
<%= form.select :value, action.options || [], {}, disabled: !needs_value %>
<% end %>
</div>

View file

@ -24,7 +24,7 @@
<%= form.select :value, condition.options, {} %>
<% else %>
<% if condition.filter.type == "number" %>
<%= form.number_field :value, placeholder: "10" %>
<%= form.number_field :value, placeholder: "10", step: 0.01 %>
<% else %>
<%= form.text_field :value, placeholder: "Enter a value" %>
<% end %>

View file

@ -17,6 +17,35 @@ class Provider::OpenaiTest < ActiveSupport::TestCase
end
end
# Replays the "openai/auto_categorize" cassette and checks that each input
# transaction is matched to the expected user category (or to nil).
# NOTE(review): these fixtures use a `name` key, while the rule executor sends
# `description` -- confirm the provider treats transaction keys as free-form.
test "auto categorizes transactions by various attributes" do
VCR.use_cassette("openai/auto_categorize") do
# Two of the three transactions also carry an aggregator "hint".
input_transactions = [
{ id: "1", name: "McDonalds", amount: 20, classification: "expense", merchant: "McDonalds", hint: "Fast Food" },
{ id: "2", name: "Amazon purchase", amount: 100, classification: "expense", merchant: "Amazon" },
{ id: "3", name: "Netflix subscription", amount: 10, classification: "expense", merchant: "Netflix", hint: "Subscriptions" }
]
response = @subject.auto_categorize(
transactions: input_transactions,
user_categories: [
{ id: "shopping_id", name: "Shopping", is_subcategory: false, parent_id: nil, classification: "expense" },
{ id: "restaurants_id", name: "Restaurants", is_subcategory: false, parent_id: nil, classification: "expense" }
]
)
assert response.success?
# One categorization is expected back per input transaction.
assert_equal input_transactions.size, response.data.size
txn1 = response.data.find { |c| c.transaction_id == "1" }
txn2 = response.data.find { |c| c.transaction_id == "2" }
txn3 = response.data.find { |c| c.transaction_id == "3" }
assert_equal "Restaurants", txn1.category_name
assert_equal "Shopping", txn2.category_name
# Transaction 3 is expected to come back without a category.
assert_nil txn3.category_name
end
end
test "basic chat response" do
VCR.use_cassette("openai/chat/basic_response") do
response = @subject.chat_response(

View file

@ -0,0 +1,45 @@
require "test_helper"
# Exercises the AI auto-categorize rule action against a mocked LLM provider.
class Rule::ActionExecutor::AiAutoCategorizeTest < ActiveSupport::TestCase
  include EntriesTestHelper, ProviderTestHelper

  AutoCategorization = Provider::LlmConcept::AutoCategorization

  setup do
    @family = families(:dylan_family)
    @account = @family.accounts.create!(name: "Rule test", balance: 100, currency: "USD", accountable: Depository.new)
    @rule = rules(:one)

    # Every Rule instance hands out the same mocked provider.
    @llm_provider = mock
    Rule.any_instance.stubs(:llm_provider).returns(@llm_provider)
  end

  test "auto-categorizes transactions" do
    first_matched, second_matched, unmatched = [ "McDonalds", "Amazon purchase", "Netflix subscription" ].map do |name|
      create_transaction(account: @account, name: name).transaction
    end

    test_category = @family.categories.create!(name: "Test category")

    categorizations = [
      AutoCategorization.new(transaction_id: first_matched.id, category_name: test_category.name),
      AutoCategorization.new(transaction_id: second_matched.id, category_name: test_category.name),
      AutoCategorization.new(transaction_id: unmatched.id, category_name: nil)
    ]

    @llm_provider.expects(:auto_categorize).returns(provider_success_response(categorizations)).once

    # All 3 of newly created transactions are enrichable by category_id
    assert_equal 3, @account.transactions.reload.enrichable(:category_id).count

    executor = Rule::ActionExecutor::AiAutoCategorize.new(@rule)
    executor.execute(@account.transactions)

    assert_equal test_category, first_matched.reload.category
    assert_equal test_category, second_matched.reload.category
    assert_nil unmatched.reload.category

    # After auto-categorization, all transactions are locked and no longer enrichable
    assert_equal 0, @account.transactions.reload.enrichable(:category_id).count
  end
end

View file

@ -0,0 +1,184 @@
---
http_interactions:
- request:
method: post
uri: https://api.openai.com/v1/responses
body:
encoding: UTF-8
string: '{"model":"gpt-4o-mini","input":[{"role":"developer","content":"Here
are the user''s available categories in JSON format:\n\n```json\n[{\"id\":\"shopping_id\",\"name\":\"Shopping\",\"is_subcategory\":false,\"parent_id\":null,\"classification\":\"expense\"},{\"id\":\"restaurants_id\",\"name\":\"Restaurants\",\"is_subcategory\":false,\"parent_id\":null,\"classification\":\"expense\"}]\n```\n\nUse
the available categories to auto-categorize the following transactions:\n\n```json\n[{\"id\":\"1\",\"name\":\"McDonalds\",\"amount\":20,\"classification\":\"expense\",\"merchant\":\"McDonalds\",\"hint\":\"Fast
Food\"},{\"id\":\"2\",\"name\":\"Amazon purchase\",\"amount\":100,\"classification\":\"expense\",\"merchant\":\"Amazon\"},{\"id\":\"3\",\"name\":\"Netflix
subscription\",\"amount\":10,\"classification\":\"expense\",\"merchant\":\"Netflix\",\"hint\":\"Subscriptions\"}]\n```\n"}],"text":{"format":{"type":"json_schema","name":"auto_categorize_personal_finance_transactions","strict":true,"schema":{"type":"object","properties":{"categorizations":{"type":"array","description":"An
array of auto-categorizations for each transaction","items":{"type":"object","properties":{"transaction_id":{"type":"string","description":"The
internal ID of the original transaction","enum":["1","2","3"]},"category_name":{"type":"string","description":"The
matched category name of the transaction, or null if no match","enum":["Shopping","Restaurants","null"]}},"required":["transaction_id","category_name"],"additionalProperties":false}}},"required":["categorizations"],"additionalProperties":false}}},"instructions":"You
are an assistant to a consumer personal finance app. You will be provided
a list\nof the user''s transactions and a list of the user''s categories. Your
job is to auto-categorize\neach transaction.\n\nClosely follow ALL the rules
below while auto-categorizing:\n\n- Return 1 result per transaction\n- Correlate
each transaction by ID (transaction_id)\n- Attempt to match the most specific
category possible (i.e. subcategory over parent category)\n- Category and
transaction classifications should match (i.e. if transaction is an \"expense\",
the category must have classification of \"expense\")\n- If you don''t know
the category, return \"null\"\n - You should always favor \"null\" over false
positives\n - Be slightly pessimistic. Only match a category if you''re
60%+ confident it is the correct one.\n- Each transaction has varying metadata
that can be used to determine the category\n - Note: \"hint\" comes from
3rd party aggregators and typically represents a category name that\n may
or may not match any of the user-supplied categories\n"}'
headers:
Content-Type:
- application/json
Authorization:
- Bearer <OPENAI_ACCESS_TOKEN>
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
- "*/*"
User-Agent:
- Ruby
response:
status:
code: 200
message: OK
headers:
Date:
- Tue, 15 Apr 2025 23:37:03 GMT
Content-Type:
- application/json
Transfer-Encoding:
- chunked
Connection:
- keep-alive
Openai-Version:
- '2020-10-01'
Openai-Organization:
- user-r6cwd3mn6iv6gn748b2xoajx
X-Request-Id:
- req_66a1351ce255af7a2d6d97f291443f0f
Openai-Processing-Ms:
- '4685'
Strict-Transport-Security:
- max-age=31536000; includeSubDomains; preload
Cf-Cache-Status:
- DYNAMIC
Set-Cookie:
- __cf_bm=Cz9arSNtnMJ5YW26HFXEFAAEuy5cyoZ1zOWjBLXvZiQ-1744760223-1.0.1.1-S15VvCx.x3tvOi74lMrJ_5XVpwtCEOtsLJ_3fbLW.qfsY4Q9.8mPYKA_PSa97.9t5iL4VxtfJZj0DrI9kDSq3aMk2Y3ajdPUHNAQnZUp3vI;
path=/; expires=Wed, 16-Apr-25 00:07:03 GMT; domain=.api.openai.com; HttpOnly;
Secure; SameSite=None
- _cfuvid=O9qfsS9kbZw6J4uiNLYwZt2lDb1iY.XZjbiUDTVWRq4-1744760223916-0.0.1.1-604800000;
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
X-Content-Type-Options:
- nosniff
Server:
- cloudflare
Cf-Ray:
- 930f44a99dbbd287-CMH
Alt-Svc:
- h3=":443"; ma=86400
body:
encoding: ASCII-8BIT
string: |-
{
"id": "resp_67feed9b386081929a83999f7c7167fa0e152c5eb776dde0",
"object": "response",
"created_at": 1744760219,
"status": "completed",
"error": null,
"incomplete_details": null,
"instructions": "You are an assistant to a consumer personal finance app. You will be provided a list\nof the user's transactions and a list of the user's categories. Your job is to auto-categorize\neach transaction.\n\nClosely follow ALL the rules below while auto-categorizing:\n\n- Return 1 result per transaction\n- Correlate each transaction by ID (transaction_id)\n- Attempt to match the most specific category possible (i.e. subcategory over parent category)\n- Category and transaction classifications should match (i.e. if transaction is an \"expense\", the category must have classification of \"expense\")\n- If you don't know the category, return \"null\"\n - You should always favor \"null\" over false positives\n - Be slightly pessimistic. Only match a category if you're 60%+ confident it is the correct one.\n- Each transaction has varying metadata that can be used to determine the category\n - Note: \"hint\" comes from 3rd party aggregators and typically represents a category name that\n may or may not match any of the user-supplied categories\n",
"max_output_tokens": null,
"model": "gpt-4o-mini-2024-07-18",
"output": [
{
"id": "msg_67feed9f6d948192824b8b23db6562720e152c5eb776dde0",
"type": "message",
"status": "completed",
"content": [
{
"type": "output_text",
"annotations": [],
"text": "{\"categorizations\":[{\"transaction_id\":\"1\",\"category_name\":\"Restaurants\"},{\"transaction_id\":\"2\",\"category_name\":\"Shopping\"},{\"transaction_id\":\"3\",\"category_name\":\"null\"}]}"
}
],
"role": "assistant"
}
],
"parallel_tool_calls": true,
"previous_response_id": null,
"reasoning": {
"effort": null,
"generate_summary": null
},
"store": true,
"temperature": 1.0,
"text": {
"format": {
"type": "json_schema",
"description": null,
"name": "auto_categorize_personal_finance_transactions",
"schema": {
"type": "object",
"properties": {
"categorizations": {
"type": "array",
"description": "An array of auto-categorizations for each transaction",
"items": {
"type": "object",
"properties": {
"transaction_id": {
"type": "string",
"description": "The internal ID of the original transaction",
"enum": [
"1",
"2",
"3"
]
},
"category_name": {
"type": "string",
"description": "The matched category name of the transaction, or null if no match",
"enum": [
"Shopping",
"Restaurants",
"null"
]
}
},
"required": [
"transaction_id",
"category_name"
],
"additionalProperties": false
}
}
},
"required": [
"categorizations"
],
"additionalProperties": false
},
"strict": true
}
},
"tool_choice": "auto",
"tools": [],
"top_p": 1.0,
"truncation": "disabled",
"usage": {
"input_tokens": 511,
"input_tokens_details": {
"cached_tokens": 0
},
"output_tokens": 39,
"output_tokens_details": {
"reasoning_tokens": 0
},
"total_tokens": 550
},
"user": null,
"metadata": {}
}
recorded_at: Tue, 15 Apr 2025 23:37:03 GMT
recorded_with: VCR 6.3.1