From dc3613d002e3f7f4d9c3fae4655abd265bf2e364 Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Tue, 21 Jan 2025 10:06:22 -0300
Subject: [PATCH 01/24] fix: Dockerfile to build and pull images.

---
 .dockerignore         |  1 -
 docker-compose.yml    |  2 +-
 pull_model.Dockerfile | 37 ++-----------------------------------
 pull_model.clj        | 32 ++++++++++++++++++++++++++++++++
 4 files changed, 35 insertions(+), 37 deletions(-)
 create mode 100644 pull_model.clj

diff --git a/.dockerignore b/.dockerignore
index 3466d3150..9f1a0a399 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,4 +1,3 @@
-*
 !*.py
 !requirements.txt
 !images/*
diff --git a/docker-compose.yml b/docker-compose.yml
index 7dacfd59c..2609cd977 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -30,7 +30,7 @@ services:
     tty: true
 
   database:
-    user: neo4j:neo4j
+    #user: neo4j:neo4j
     image: neo4j:5.23
     ports:
       - 7687:7687
diff --git a/pull_model.Dockerfile b/pull_model.Dockerfile
index b06625f7d..eb0ad4cd4 100644
--- a/pull_model.Dockerfile
+++ b/pull_model.Dockerfile
@@ -6,40 +6,7 @@ FROM babashka/babashka:latest
 # just using as a client - never as a server
 COPY --from=ollama /bin/ollama ./bin/ollama
 
-COPY <<EOF pull_model.clj
-(ns pull-model
-  (:require [babashka.process :as process]
-            [clojure.core.async :as async]))
+COPY pull_model.clj /usr/src/pull_model.clj
 
-(try
-  (let [llm (get (System/getenv) "LLM")
-        url (get (System/getenv) "OLLAMA_BASE_URL")]
-    (println (format "pulling ollama model %s using %s" llm url))
-    (if (and llm 
-         url 
-         (not (#{"gpt-4" "gpt-3.5" "claudev2" "gpt-4o" "gpt-4-turbo"} llm))
-         (not (some #(.startsWith llm %) ["ai21.jamba-instruct-v1:0"
-                                          "amazon.titan"
-                                          "anthropic.claude"
-                                          "cohere.command"
-                                          "meta.llama"
-                                          "mistral.mi"])))
-
-      ;; ----------------------------------------------------------------------
-      ;; just call `ollama pull` here - create OLLAMA_HOST from OLLAMA_BASE_URL
-      ;; ----------------------------------------------------------------------
-      ;; TODO - this still doesn't show progress properly when run from docker compose
-
-      (let [done (async/chan)]
-        (async/go-loop [n 0]
-          (let [[v _] (async/alts! [done (async/timeout 5000)])]
-            (if (= :stop v) :stopped (do (println (format "... pulling model (%ss) - will take several minutes" (* n 10))) (recur (inc n))))))
-        (process/shell {:env {"OLLAMA_HOST" url "HOME" (System/getProperty "user.home")} :out :inherit :err :inherit} (format "bash -c './bin/ollama show %s --modelfile > /dev/null || ./bin/ollama pull %s'" llm llm))
-        (async/>!! done :stop))
-
-      (println "OLLAMA model only pulled if both LLM and OLLAMA_BASE_URL are set and the LLM model is not gpt")))
-  (catch Throwable _ (System/exit 1)))
-EOF
-
-ENTRYPOINT ["bb", "-f", "pull_model.clj"]
+ENTRYPOINT ["bb", "-f", "/usr/src/pull_model.clj"]
 
diff --git a/pull_model.clj b/pull_model.clj
new file mode 100644
index 000000000..99bffc6dd
--- /dev/null
+++ b/pull_model.clj
@@ -0,0 +1,32 @@
+(ns pull-model
+  (:require [babashka.process :as process]
+            [clojure.core.async :as async]))
+
+(try
+  (let [llm (get (System/getenv) "LLM")
+        url (get (System/getenv) "OLLAMA_BASE_URL")]
+    (println (format "pulling ollama model %s using %s" llm url))
+    (if (and llm
+         url
+         (not (#{"gpt-4" "gpt-3.5" "claudev2" "gpt-4o" "gpt-4-turbo"} llm))
+         (not (some #(.startsWith llm %) ["ai21.jamba-instruct-v1:0"
+                                          "amazon.titan"
+                                          "anthropic.claude"
+                                          "cohere.command"
+                                          "meta.llama"
+                                          "mistral.mi"])))
+
+      ;; ----------------------------------------------------------------------
+      ;; just call `ollama pull` here - create OLLAMA_HOST from OLLAMA_BASE_URL
+      ;; ----------------------------------------------------------------------
+      ;; TODO - this still doesn't show progress properly when run from docker compose
+
+      (let [done (async/chan)]
+        (async/go-loop [n 0]
+          (let [[v _] (async/alts! [done (async/timeout 5000)])]
+            (if (= :stop v) :stopped (do (println (format "... pulling model (%ss) - will take several minutes" (* n 10))) (recur (inc n))))))
+        (process/shell {:env {"OLLAMA_HOST" url "HOME" (System/getProperty "user.home")} :out :inherit :err :inherit} (format "bash -c './bin/ollama show %s --modelfile > /dev/null || ./bin/ollama pull %s'" llm llm))
+        (async/>!! done :stop))
+
+      (println "OLLAMA model only pulled if both LLM and OLLAMA_BASE_URL are set and the LLM model is not gpt")))
+  (catch Throwable _ (System/exit 1)))

From 6940459d54fb1c66cc53e4c71b8fb8412dc1f359 Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Thu, 13 Feb 2025 14:36:10 -0300
Subject: [PATCH 02/24] feat: bot using tesseract to extract text from images
 and implement RAG.

---
 docker-compose.yml            |  43 +++++++++++
 multiple_files_bot.Dockerfile |  28 ++++++++
 multiple_files_bot.py         | 130 ++++++++++++++++++++++++++++++++++
 requirements.txt              |   2 +
 4 files changed, 203 insertions(+)
 create mode 100644 multiple_files_bot.Dockerfile
 create mode 100644 multiple_files_bot.py

diff --git a/docker-compose.yml b/docker-compose.yml
index 2609cd977..dfbad246c 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -86,6 +86,7 @@ services:
             - bot.py
             - pdf_bot.py
             - api.py
+            - multiple_files_bot.py
             - front-end/
     ports:
       - 8081:8080
@@ -129,6 +130,7 @@ services:
             - loader.py
             - pdf_bot.py
             - api.py
+            - multiple_files_bot.py
             - front-end/
     ports:
       - 8501:8501
@@ -168,10 +170,50 @@ services:
             - loader.py
             - bot.py
             - api.py
+            - multiple_files_bot.py
             - front-end/
     ports:
       - 8503:8503
 
+  multiple_files_bot:
+    build:
+      context: .
+      dockerfile: multiple_files_bot.Dockerfile
+    environment:
+      - NEO4J_URI=${NEO4J_URI-neo4j://database:7687}
+      - NEO4J_PASSWORD=${NEO4J_PASSWORD-password}
+      - NEO4J_USERNAME=${NEO4J_USERNAME-neo4j}
+      - OPENAI_API_KEY=${OPENAI_API_KEY-}
+      - GOOGLE_API_KEY=${GOOGLE_API_KEY-}
+      - OLLAMA_BASE_URL=${OLLAMA_BASE_URL-http://host.docker.internal:11434}
+      - LLM=${LLM-llama2}
+      - EMBEDDING_MODEL=${EMBEDDING_MODEL-sentence_transformer}
+      - LANGCHAIN_ENDPOINT=${LANGCHAIN_ENDPOINT-"https://api.smith.langchain.com"}
+      - LANGCHAIN_TRACING_V2=${LANGCHAIN_TRACING_V2-false}
+      - LANGCHAIN_PROJECT=${LANGCHAIN_PROJECT}
+      - LANGCHAIN_API_KEY=${LANGCHAIN_API_KEY}
+      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
+      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
+      - AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION}
+    networks:
+      - net
+    depends_on:
+      database:
+        condition: service_healthy
+      pull-model:
+        condition: service_completed_successfully
+    x-develop:
+      watch:
+        - action: rebuild
+          path: .
+          ignore:
+            - loader.py
+            - bot.py
+            - api.py
+            - front-end/
+    ports:
+      - 8505:8505
+
   api:
     build:
       context: .
@@ -209,6 +251,7 @@ services:
             - loader.py
             - bot.py
             - pdf_bot.py
+            - multiple_files_bot.py
             - front-end/
     ports:
       - 8504:8504
diff --git a/multiple_files_bot.Dockerfile b/multiple_files_bot.Dockerfile
new file mode 100644
index 000000000..1845ccdce
--- /dev/null
+++ b/multiple_files_bot.Dockerfile
@@ -0,0 +1,28 @@
+FROM langchain/langchain
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    software-properties-common \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN apt-get update && apt-get install -y \
+    tesseract-ocr \
+    libtesseract-dev \
+    tesseract-ocr-por
+
+COPY requirements.txt .
+
+RUN pip install --upgrade -r requirements.txt
+
+COPY multiple_files_bot.py .
+COPY utils.py .
+COPY chains.py .
+
+EXPOSE 8505
+
+HEALTHCHECK CMD curl --fail http://localhost:8505/_stcore/health
+
+ENTRYPOINT ["streamlit", "run", "multiple_files_bot.py", "--server.port=8505", "--server.address=0.0.0.0"]
diff --git a/multiple_files_bot.py b/multiple_files_bot.py
new file mode 100644
index 000000000..96991a2fb
--- /dev/null
+++ b/multiple_files_bot.py
@@ -0,0 +1,130 @@
+import os
+
+import streamlit as st
+from langchain.chains import RetrievalQA
+from PyPDF2 import PdfReader
+from pdf2image import convert_from_path
+from PIL import Image
+import pytesseract
+from langchain.callbacks.base import BaseCallbackHandler
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import Neo4jVector
+from langchain_community.document_loaders import ConcurrentLoader
+from streamlit.logger import get_logger
+from chains import (
+    load_embedding_model,
+    load_llm,
+)
+
+# load api key lib
+from dotenv import load_dotenv
+
+load_dotenv(".env")
+
+
+url = os.getenv("NEO4J_URI")
+username = os.getenv("NEO4J_USERNAME")
+password = os.getenv("NEO4J_PASSWORD")
+ollama_base_url = os.getenv("OLLAMA_BASE_URL")
+embedding_model_name = os.getenv("EMBEDDING_MODEL")
+llm_name = os.getenv("LLM")
+# Remapping for Langchain Neo4j integration
+os.environ["NEO4J_URL"] = url
+
+logger = get_logger(__name__)
+
+
+embeddings, dimension = load_embedding_model(
+    embedding_model_name, config={"ollama_base_url": ollama_base_url}, logger=logger
+)
+
+
+class StreamHandler(BaseCallbackHandler):
+    def __init__(self, container, initial_text=""):
+        self.container = container
+        self.text = initial_text
+
+    def on_llm_new_token(self, token: str, **kwargs) -> None:
+        self.text += token
+        self.container.markdown(self.text)
+
+
+llm = load_llm(llm_name, logger=logger, config={"ollama_base_url": ollama_base_url})
+
+
+def main():
+    st.header("📄 Revise a Minuta da Escritura")
+
+    # upload a your files
+    uploaded_files = st.file_uploader(
+			"Arraste os documentos necessários e a minuta da Escritura (PDF, png, jpeg, ou arquivos .txt)", 
+			accept_multiple_files=True,
+			type=["png", "jpg", "jpeg", "pdf", "txt"]
+	  	     )
+
+    text = ""
+    for file in uploaded_files:
+        bytes_data = file.read()
+        file_format = file.name.split('.')[1].lower()
+        text += f"NOME DO ARQUIVO: {file.name}"
+
+        match file_format:
+            case 'pdf':
+                try:
+                    pdf_reader = PdfReader(file)
+                    for page in pdf_reader.pages:
+                        text += page.extract_text()
+                except:
+                    images = convert_from_bytes(bytes_data)
+                    for i, image in enumerate(images):
+                        # image.save(file.name.split('.')[0] + '.png')
+                        text += pytesseract.image_to_string(image, lang='por')
+            case 'txt':
+                #with open(file, encoding='utf8', mode='r') as f:
+                for line in file:
+                    text += file.read()
+                    st.write(file.read())
+            case 'png':
+                text += pytesseract.image_to_string(Image.open(file), lang='por')
+            case 'jpg':
+                text += pytesseract.image_to_string(Image.open(file), lang='por')
+            case 'jpeg':
+                text += pytesseract.image_to_string(Image.open(file), lang='por')
+            case _:
+                st.write("Formato do arquivo:", file.name, "não é suportado!")
+
+    # langchain_textspliter
+    text_splitter = RecursiveCharacterTextSplitter(
+                        chunk_size=1000, chunk_overlap=200, length_function=len
+                    )
+    chunks = text_splitter.split_text(text=text)
+
+    # Store the chuncks part in db (vector)
+    vectorstore = Neo4jVector.from_texts(
+        chunks,
+        url=url,
+        username=username,
+        password=password,
+        embedding=embeddings,
+        index_name="multiple_files_bot",
+        node_label="MultipleFilesBotChunk",
+        pre_delete_collection=True, # Delete existing data in collection
+    )
+    questions_and_answers = RetrievalQA.from_chain_type(
+        llm=llm, chain_type="stuff", retriever=vectorstore.as_retriever()
+    )
+
+    # Accept user questions/query
+    query = st.text_input(
+        """
+        Faça a revisão das cláusulas da Minuta pedindo para a IA extrair os dados dos documentos em anexo e 
+        comparar com os dados da Minuta.
+        """
+    )
+
+    if query:
+        stream_handler = StreamHandler(st.empty())
+        questions_and_answers.run(query, callbacks=[stream_handler])
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
index 2670d2535..db6a1e032 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,9 @@ neo4j
 streamlit
 Pillow
 fastapi
+pdf2image==1.17.0
 PyPDF2
+pytesseract
 pydantic
 uvicorn
 sse-starlette

From 88e27bb41ca70c127db51ee07720789ef2b15ad1 Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Sat, 29 Mar 2025 16:43:40 -0300
Subject: [PATCH 03/24] Creating a prompt for each document.

---
 prompts.json | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 prompts.json

diff --git a/prompts.json b/prompts.json
new file mode 100644
index 000000000..d90c4c46f
--- /dev/null
+++ b/prompts.json
@@ -0,0 +1,44 @@
+{
+    "CNH": {
+        "v1": "Extraia do documento CNH os dados nos campos: Nome completo, nacionalidade, data de nascimento, RG e órgão expedidor e CPF.",
+
+        "v2": "Extraia do documento CNH os dados nos campos: Nome completo (1º Nome encontrado), nacionalidade, data de nascimento, RG com órgão expedidor e CPF (localizado após o RG).",
+
+        "v3": "Extraia do documento CNH os dados nos campos: Nome completo (1º Nome encontrado), nacionalidade (faça inferência, se necessário), data de nascimento, RG com órgão expedidor e CPF (11 dígitos, localizado após o RG). Informe se a data de vencimento é maior que a data atual: 25/02/2025 (em caso negativo, retorne DOCUMENTO INVÁLIDO).",
+
+        "v4": "Extraia do documento CNH os dados nos campos (alguns campos podem possuir escrita parecida com os dados a seguir, tente buscar campo com nome parecido): Nome completo (1º Nome encontrado), nacionalidade (faça inferência, se necessário), data de nascimento, RG com órgão expedidor e CPF (11 dígitos, localizado após o RG). Informe se a data de validade é maior que a data atual: 25/02/2025 (em caso negativo, retorne DOCUMENTO INVÁLIDO).",
+        
+        "latest": {
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'CNH'. Always return a structured table consolidating the information in the end of the answer.",
+            "input": "Extraia os dados nos campos: Nome completo (1º Nome encontrado), nacionalidade (faça inferência, se necessário), data de nascimento, RG com órgão expedidor e CPF (11 dígitos separados por '.' e '-', localizado após o RG). Informe se a data de validade é maior que a data atual: 25/02/2025 (em caso negativo, retorne DOCUMENTO INVÁLIDO)."
+        },
+    
+        "resposta": "Nome Completo: MARLI SILVA DE ANDRADE; Nacionalidade: Brasileira (inferente do órgão emitente); Data de Nascimento: 19/08/1968; RG com Órgão Expedidor: 3198072 - SSP PE; CPF: Não localizado na informação fornecida; Validade do Documento: Até 29/04/2026. Como esta data está após 25/02/2025, o documento é válido."
+    },
+
+    "Comprovante de Residência": {
+        "v1": "Extraia do documento 'Comprovante de Residência' os dados relacionados a endereço e CEP (Caixa Postal/ZIP Code).",
+
+        "latest": {
+            "prompt":"You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Comprovante de Residência'. Always return a structured table consolidating the information in the end of the answer.",
+            "input": "Extraia os dados relacionados a endereço e CEP (Caixa Postal/ZIP Code). Apresente essas informações por extenso sem as abreviações de avenida, rua e apartamento/lote. Verifique se a data de envio do documento possui até 30 dias de diferença da data atual: 25/02/2025 (em caso negativo, retorne DOCUMENTO INVÁLIDO)." 
+        },
+
+        "resposta": "Nome: WILSON PEREIRA DE LIMA; Endereço: Rua Setubal, 1245 - Apartamento 1402; CEP (Caixa Postal): 51130-010; Localização: Recife - PE."
+    },
+
+    "Certidão de Casamento": {
+        "v1": "Extraia do documento 'Certidão de Casamento' os dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens.",
+
+        "v2": "Extraia do documento 'Certidão de Casamento' os dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens e extrair os dados de registro da certidão e onde a certidão foi emitida.",
+
+        "v3": "Extraia do documento 'Certidão de Casamento' os dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens. Extrair o número de registro da certidão, onde a certidão foi emitida e a data de emissão da Certidão.",
+
+        "v4": "Extraia do documento 'Certidão de Casamento' os dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens. Extrair o número de registro da certidão, onde a certidão foi emitida e a data de emissão da Certidão.",
+
+        "latest": {
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brazil) and return a structured table consolidating the information at the end of the answer. Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Certidão de Casamento'.",
+            "input": "Extraia dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens. Extrair o número de registro da certidão, onde a certidão foi emitida e a data de emissão da Certidão."
+        }
+    }
+}

From 8ff1437f5a5a5d2d0a5c0177057f4820ee760c14 Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Sat, 29 Mar 2025 16:45:14 -0300
Subject: [PATCH 04/24] refactoring app to process and run queries about the
 documents.

---
 multiple_files_bot.Dockerfile |   2 +
 multiple_files_bot.py         | 207 ++++++++++++++++++++++------------
 2 files changed, 137 insertions(+), 72 deletions(-)

diff --git a/multiple_files_bot.Dockerfile b/multiple_files_bot.Dockerfile
index 1845ccdce..ad050dc91 100644
--- a/multiple_files_bot.Dockerfile
+++ b/multiple_files_bot.Dockerfile
@@ -9,6 +9,7 @@ RUN apt-get update && apt-get install -y \
     && rm -rf /var/lib/apt/lists/*
 
 RUN apt-get update && apt-get install -y \
+    poppler-utils \
     tesseract-ocr \
     libtesseract-dev \
     tesseract-ocr-por
@@ -18,6 +19,7 @@ COPY requirements.txt .
 RUN pip install --upgrade -r requirements.txt
 
 COPY multiple_files_bot.py .
+COPY prompts.json .
 COPY utils.py .
 COPY chains.py .
 
diff --git a/multiple_files_bot.py b/multiple_files_bot.py
index 96991a2fb..7d92b4c82 100644
--- a/multiple_files_bot.py
+++ b/multiple_files_bot.py
@@ -1,15 +1,17 @@
 import os
+import json
 
 import streamlit as st
-from langchain.chains import RetrievalQA
-from PyPDF2 import PdfReader
-from pdf2image import convert_from_path
+from pdf2image import convert_from_bytes
 from PIL import Image
 import pytesseract
+from langchain.chains import RetrievalQA
 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Neo4jVector
-from langchain_community.document_loaders import ConcurrentLoader
+from langchain_core.prompts import ChatPromptTemplate
+from langchain.chains import create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
 from streamlit.logger import get_logger
 from chains import (
     load_embedding_model,
@@ -39,6 +41,14 @@
 )
 
 
+prompts = dict()
+with open('prompts.json', 'rb') as f:
+    prompts = json.load(f)
+
+
+llm = load_llm(llm_name, logger=logger, config={"ollama_base_url": ollama_base_url})
+
+
 class StreamHandler(BaseCallbackHandler):
     def __init__(self, container, initial_text=""):
         self.container = container
@@ -49,82 +59,135 @@ def on_llm_new_token(self, token: str, **kwargs) -> None:
         self.container.markdown(self.text)
 
 
-llm = load_llm(llm_name, logger=logger, config={"ollama_base_url": ollama_base_url})
-
-
-def main():
-    st.header("📄 Revise a Minuta da Escritura")
-
-    # upload a your files
-    uploaded_files = st.file_uploader(
-			"Arraste os documentos necessários e a minuta da Escritura (PDF, png, jpeg, ou arquivos .txt)", 
-			accept_multiple_files=True,
-			type=["png", "jpg", "jpeg", "pdf", "txt"]
-	  	     )
-
+def RAG_document_validator_and_text_extractor(document_name: str, uploaded_file):
     text = ""
-    for file in uploaded_files:
-        bytes_data = file.read()
-        file_format = file.name.split('.')[1].lower()
-        text += f"NOME DO ARQUIVO: {file.name}"
-
+    if uploaded_file:
+        bytes_data = uploaded_file.read()
+        file_format = uploaded_file.name.split('.')[1].lower()
+        
         match file_format:
             case 'pdf':
-                try:
-                    pdf_reader = PdfReader(file)
-                    for page in pdf_reader.pages:
-                        text += page.extract_text()
-                except:
-                    images = convert_from_bytes(bytes_data)
-                    for i, image in enumerate(images):
-                        # image.save(file.name.split('.')[0] + '.png')
-                        text += pytesseract.image_to_string(image, lang='por')
+                images = convert_from_bytes(bytes_data)
+                for i, image in enumerate(images):
+                    text += f"Página: {i} \n\n" + pytesseract.image_to_string(image, lang='por')
             case 'txt':
-                #with open(file, encoding='utf8', mode='r') as f:
-                for line in file:
-                    text += file.read()
-                    st.write(file.read())
+                for line in uploaded_file:
+                    text += line
             case 'png':
-                text += pytesseract.image_to_string(Image.open(file), lang='por')
+                text += pytesseract.image_to_string(Image.open(uploaded_file), lang='por')
             case 'jpg':
-                text += pytesseract.image_to_string(Image.open(file), lang='por')
+                text += pytesseract.image_to_string(Image.open(uploaded_file), lang='por')
             case 'jpeg':
-                text += pytesseract.image_to_string(Image.open(file), lang='por')
+                text += pytesseract.image_to_string(Image.open(uploaded_file), lang='por')
             case _:
-                st.write("Formato do arquivo:", file.name, "não é suportado!")
-
-    # langchain_textspliter
-    text_splitter = RecursiveCharacterTextSplitter(
-                        chunk_size=1000, chunk_overlap=200, length_function=len
-                    )
-    chunks = text_splitter.split_text(text=text)
-
-    # Store the chuncks part in db (vector)
-    vectorstore = Neo4jVector.from_texts(
-        chunks,
-        url=url,
-        username=username,
-        password=password,
-        embedding=embeddings,
-        index_name="multiple_files_bot",
-        node_label="MultipleFilesBotChunk",
-        pre_delete_collection=True, # Delete existing data in collection
-    )
-    questions_and_answers = RetrievalQA.from_chain_type(
-        llm=llm, chain_type="stuff", retriever=vectorstore.as_retriever()
-    )
-
-    # Accept user questions/query
-    query = st.text_input(
-        """
-        Faça a revisão das cláusulas da Minuta pedindo para a IA extrair os dados dos documentos em anexo e 
-        comparar com os dados da Minuta.
-        """
-    )
-
-    if query:
-        stream_handler = StreamHandler(st.empty())
-        questions_and_answers.run(query, callbacks=[stream_handler])
+                st.write("Formato do arquivo:", uploaded_file.name, "não é suportado!")
+
+        # langchain_textspliter
+        text_splitter = RecursiveCharacterTextSplitter(
+                            chunk_size=10000,
+                            chunk_overlap=200,
+                            length_function=len
+                        )
+        chunks = text_splitter.split_text(text=text)
+        chunks = [f"NOME_DO_DOCUMENTO: {document_name} " + chunk for chunk in chunks]
+
+        # Store the chuncks part in db (vector)
+        vectorstore = Neo4jVector.from_texts(
+            chunks,
+            url=url,
+            username=username,
+            password=password,
+            embedding=embeddings,
+            node_label=f"MultipleFilesBotChunk_{document_name}",
+            pre_delete_collection=True, # Delete existing data in collection
+        )
+
+        system_prompt = prompts[document_name].get('latest')['prompt'] + " Context: {context}"
+        prompt = ChatPromptTemplate(
+            [
+                ("system", system_prompt),
+                ("human", "{input}")
+            ]
+        )
+        qa_chain = create_stuff_documents_chain(llm, prompt)
+        agent_document_retreiver = create_retrieval_chain(vectorstore.as_retriever(), qa_chain)
+
+        # agent_document_retreiver = RetrievalQA.from_chain_type(
+        #     llm=llm, 
+        #     chain_type="stuff", 
+        #     retriever=vectorstore.as_retriever(), 
+        #     prompt=prompt
+        # )
+
+        return agent_document_retreiver
+        
+agents = dict()
+documents_list = ['CNH', 'Comprovante de Residência', 'Certidão de Casamento']
+if 'documents' not in st.session_state:
+    st.session_state.documents = []
+
+def main():
+    st.header("📄 Revise os documentos apresentados para a Escritura")
+
+    with st.sidebar:
+        st.title("Partes Envolvidas")
+
+        st.subheader("Parte Compradora")
+        st.text("Documentos apresentados:")
+        for doc in st.session_state.documents:
+            st.text(doc)
+
+        st.subheader("Parte Vendedora")
+        st.text("Documentos apresentados...")
+
+    tabs = st.tabs(documents_list)
+
+    for tab, document in zip(tabs, documents_list):
+        with tab:
+            # upload a your files
+            uploaded_file = st.file_uploader(
+                "Suba o documento em algum desses formatos: PDF, png, jpeg, ou txt.", 
+                accept_multiple_files=False,
+                type=["png", "jpg", "jpeg", "pdf", "txt"],
+                key=document
+            )
+
+            if uploaded_file:
+                st.write("A IA irá coletar e validar as informações presentes...")
+
+                # Text extraction and embedding using OCR and LLM to build a QA RAG
+                query = prompts[document].get('latest')['input']
+                agent = RAG_document_validator_and_text_extractor(document, uploaded_file)
+                answer = agent.invoke({'input': query})['answer']
+                stream_handler = StreamHandler(st.empty())
+                for token in answer:
+                    stream_handler.on_llm_new_token(token=token)
+                if document not in st.session_state.documents:
+                    st.session_state.documents.append(document)
+
+    # if any(uploaded_files):
+    #     for uploaded_file, document in zip(uploaded_files, documents_list):
+    #         agents[document] = RAG_document_validator_and_text_extractor(document, uploaded_file)
+        
+    #     st.write(
+    #         f"""
+    #         A IA vai revisar os documentos para extrair e validar as informações presentes.
+    #         """
+    #     )
+        
+    #     stream_handler = StreamHandler(st.empty())
+
+    #     for document in documents_list:
+    #         if document not in st.session_state.document:
+    #             # Accept user questions/query
+    #             query = prompts[document].get('latest')['input'] # st.text_input()
+    #             agent = agents.get(document, None)
+    #             if query and agent:
+    #                 st.session_state.document.append(document)
+    #                 answer = agent.invoke({'input': query})['answer']
+                    
+    #                 for token in answer:
+    #                     stream_handler.on_llm_new_token(token=token)
 
 if __name__ == "__main__":
     main()

From 69003b9dc472a80839ce3a2b9ca73092640fc8aa Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Sat, 29 Mar 2025 16:49:45 -0300
Subject: [PATCH 05/24] remove commented code lines.

---
 multiple_files_bot.py | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/multiple_files_bot.py b/multiple_files_bot.py
index 7d92b4c82..ea30c2aca 100644
--- a/multiple_files_bot.py
+++ b/multiple_files_bot.py
@@ -165,29 +165,6 @@ def main():
                 if document not in st.session_state.documents:
                     st.session_state.documents.append(document)
 
-    # if any(uploaded_files):
-    #     for uploaded_file, document in zip(uploaded_files, documents_list):
-    #         agents[document] = RAG_document_validator_and_text_extractor(document, uploaded_file)
-        
-    #     st.write(
-    #         f"""
-    #         A IA vai revisar os documentos para extrair e validar as informações presentes.
-    #         """
-    #     )
-        
-    #     stream_handler = StreamHandler(st.empty())
-
-    #     for document in documents_list:
-    #         if document not in st.session_state.document:
-    #             # Accept user questions/query
-    #             query = prompts[document].get('latest')['input'] # st.text_input()
-    #             agent = agents.get(document, None)
-    #             if query and agent:
-    #                 st.session_state.document.append(document)
-    #                 answer = agent.invoke({'input': query})['answer']
-                    
-    #                 for token in answer:
-    #                     stream_handler.on_llm_new_token(token=token)
 
 if __name__ == "__main__":
     main()

From 0efcd322d11e169c8ad9a43d05d220b3ab1cb4ea Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Tue, 8 Apr 2025 19:53:43 -0300
Subject: [PATCH 06/24] feat: minuta prompt and input query.

---
 prompts.json | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/prompts.json b/prompts.json
index d90c4c46f..0c79a8c01 100644
--- a/prompts.json
+++ b/prompts.json
@@ -1,4 +1,18 @@
 {
+    "Minuta Comprador": {
+        "latest": {
+            "prompt_minuta": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta'. Always simplify the answer returning a structured table with information summary.",
+            "input_minuta": "Extraia todos os dados pessoais da parte compradora (inclua nome completo) escritos no parágrafo que contém o termo 'Outorgada Compradora'. Crie uma tabela com 2 colunas: Dado e Valor."
+        }
+    },
+
+    "Minuta Vendedor": {
+        "latest": {
+            "prompt_minuta": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta'. Always simplify the answer returning a structured table with information summary.",
+            "input_minuta": "Extraia todos os dados pessoais da parte vendedora (inclua nome completo) escritos no parágrafo que contém o termo 'Outorgante Vendedor'. Crie uma tabela com 2 colunas: Dado e Valor."
+        }
+    },
+
     "CNH": {
         "v1": "Extraia do documento CNH os dados nos campos: Nome completo, nacionalidade, data de nascimento, RG e órgão expedidor e CPF.",
 
@@ -37,7 +51,7 @@
         "v4": "Extraia do documento 'Certidão de Casamento' os dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens. Extrair o número de registro da certidão, onde a certidão foi emitida e a data de emissão da Certidão.",
 
         "latest": {
-            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brazil) and return a structured table consolidating the information at the end of the answer. Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Certidão de Casamento'.",
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brazil). Always return a structured table gathering the information at the end. Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Certidão de Casamento'.",
             "input": "Extraia dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens. Extrair o número de registro da certidão, onde a certidão foi emitida e a data de emissão da Certidão."
         }
     }

From f41f7ffce51dce73d03640f23ce5313c5e2c7744 Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Tue, 8 Apr 2025 19:54:57 -0300
Subject: [PATCH 07/24] feat: minuta pdf content extraction and RAG agent.

---
 multiple_files_bot.py | 164 ++++++++++++++++++++++++++++++++----------
 1 file changed, 125 insertions(+), 39 deletions(-)

diff --git a/multiple_files_bot.py b/multiple_files_bot.py
index ea30c2aca..fd73decab 100644
--- a/multiple_files_bot.py
+++ b/multiple_files_bot.py
@@ -59,7 +59,48 @@ def on_llm_new_token(self, token: str, **kwargs) -> None:
         self.container.markdown(self.text)
 
 
-def RAG_document_validator_and_text_extractor(document_name: str, uploaded_file):
+def RAG_Minuta(document_name: str, uploaded_file):
+    text = ""
+    if uploaded_file:
+        bytes_data = uploaded_file.read()
+        file_format = uploaded_file.name.split('.')[1].lower()
+        
+        match file_format:
+            case 'pdf':
+                images = convert_from_bytes(bytes_data)
+                # Somente a 1ª página
+                text += pytesseract.image_to_string(images[0], lang='por') + " \n\n"
+
+                # for i, image in enumerate(images):
+                #     text += f"Página: {i} \n\n" + pytesseract.image_to_string(image, lang='por')
+            case _:
+                st.write("Formato do arquivo:", uploaded_file.name, "não é suportado!")
+
+        # langchain_textspliter
+        text_splitter = RecursiveCharacterTextSplitter(
+                            chunk_size=10000,
+                            chunk_overlap=200,
+                            length_function=len, 
+                            separators=['\n\n', '\n']
+                        )
+        chunks = text_splitter.split_text(text=text)
+        chunks = [f"NOME_DO_DOCUMENTO: {document_name} " + chunk for chunk in chunks]
+
+        # Store the chuncks part in db (vector)
+        vectorstore = Neo4jVector.from_texts(
+            chunks,
+            url=url,
+            username=username,
+            password=password,
+            embedding=embeddings,
+            node_label=f"MultipleFilesBotChunk_{document_name}",
+            pre_delete_collection=True, # Delete existing data in collection
+        )
+
+        return vectorstore
+
+
+def RAG_agent_document_validator(document_name: str, uploaded_file):
     text = ""
     if uploaded_file:
         bytes_data = uploaded_file.read()
@@ -102,32 +143,36 @@ def RAG_document_validator_and_text_extractor(document_name: str, uploaded_file)
             pre_delete_collection=True, # Delete existing data in collection
         )
 
-        system_prompt = prompts[document_name].get('latest')['prompt'] + " Context: {context}"
-        prompt = ChatPromptTemplate(
+        agent_document_retreiver = build_RAG_agent(document_name, vectorstore)
+
+        return agent_document_retreiver
+
+
+def build_RAG_agent(document_name, vectorstore, prompt=None, history_context=""):
+    if not prompt:
+        prompt = prompts[document_name].get('latest')['prompt']
+
+    system_prompt = prompt + " Context: {context} " + history_context + " "
+    prompt = ChatPromptTemplate(
             [
                 ("system", system_prompt),
                 ("human", "{input}")
             ]
         )
-        qa_chain = create_stuff_documents_chain(llm, prompt)
-        agent_document_retreiver = create_retrieval_chain(vectorstore.as_retriever(), qa_chain)
+    qa_chain = create_stuff_documents_chain(llm, prompt)
+    agent_document_retreiver = create_retrieval_chain(vectorstore.as_retriever(), qa_chain)
+    return agent_document_retreiver
 
-        # agent_document_retreiver = RetrievalQA.from_chain_type(
-        #     llm=llm, 
-        #     chain_type="stuff", 
-        #     retriever=vectorstore.as_retriever(), 
-        #     prompt=prompt
-        # )
 
-        return agent_document_retreiver
-        
 agents = dict()
 documents_list = ['CNH', 'Comprovante de Residência', 'Certidão de Casamento']
 if 'documents' not in st.session_state:
     st.session_state.documents = []
 
 def main():
-    st.header("📄 Revise os documentos apresentados para a Escritura")
+    st.title("📄 StartLegal - Agente Revisor de Minutas")
+
+    st.subheader("Anexe a minuta de uma escritura e em seguida os documentos necessários para revisão.")
 
     with st.sidebar:
         st.title("Partes Envolvidas")
@@ -140,31 +185,72 @@ def main():
         st.subheader("Parte Vendedora")
         st.text("Documentos apresentados...")
 
-    tabs = st.tabs(documents_list)
-
-    for tab, document in zip(tabs, documents_list):
-        with tab:
-            # upload a your files
-            uploaded_file = st.file_uploader(
-                "Suba o documento em algum desses formatos: PDF, png, jpeg, ou txt.", 
-                accept_multiple_files=False,
-                type=["png", "jpg", "jpeg", "pdf", "txt"],
-                key=document
-            )
-
-            if uploaded_file:
-                st.write("A IA irá coletar e validar as informações presentes...")
-
-                # Text extraction and embedding using OCR and LLM to build a QA RAG
-                query = prompts[document].get('latest')['input']
-                agent = RAG_document_validator_and_text_extractor(document, uploaded_file)
-                answer = agent.invoke({'input': query})['answer']
-                stream_handler = StreamHandler(st.empty())
-                for token in answer:
-                    stream_handler.on_llm_new_token(token=token)
-                if document not in st.session_state.documents:
-                    st.session_state.documents.append(document)
-
+     # upload a your files
+    uploaded_file_minuta = st.file_uploader(
+        "Suba o documento da Minuta em formato PDF.", 
+        accept_multiple_files=False,
+        type="pdf",
+        key='minuta'
+    )
+
+    if uploaded_file_minuta:
+        if not st.session_state.get('rag_minuta', False):   
+            st.write("A IA irá coletar as informações presentes no documento...")
+            
+            minuta_vector_db = RAG_Minuta('Minuta', uploaded_file_minuta)
+            if 'rag_minuta' not in st.session_state:
+                st.session_state['rag_minuta'] = True
+                st.session_state.minuta_db = minuta_vector_db
+
+                # Print a table with Minuta information
+                if 'minuta_db' in st.session_state:
+                    minuta_system = prompts['Minuta Comprador'].get('latest').get('prompt_minuta', None)
+                    minuta_agent = build_RAG_agent('Minuta', st.session_state.minuta_db, minuta_system) # " gere uma tabela juntando a coluna 'Valor' das tabelas existentes."
+                    
+                    query = prompts['Minuta Comprador'].get('latest')['input_minuta']
+                    minuta_response = minuta_agent.invoke({'input': query })
+                    answer = minuta_response['answer']
+                    st.session_state.minuta_comprador = answer
+
+                    minuta_system = prompts['Minuta Vendedor'].get('latest').get('prompt_minuta', None)
+                    minuta_agent = build_RAG_agent('Minuta', st.session_state.minuta_db, minuta_system) # " gere uma tabela juntando a coluna 'Valor' das tabelas existentes."
+                    
+                    query = prompts['Minuta Vendedor'].get('latest')['input_minuta']
+                    minuta_response = minuta_agent.invoke({'input': query })
+                    answer = minuta_response['answer']
+                    st.session_state.minuta_vendedor = answer
+
+    # Activate tabs after Minuta has been processed...
+    if st.session_state.minuta:
+        tabs = st.tabs(documents_list)
+
+        for tab, document in zip(tabs, documents_list):
+            with tab:
+                # upload a your files
+                uploaded_file = st.file_uploader(
+                    "Suba o documento em algum desses formatos: PDF, png, jpeg, ou txt.", 
+                    accept_multiple_files=False,
+                    type=["png", "jpg", "jpeg", "pdf", "txt"],
+                    key=document
+                )
+
+                if uploaded_file:
+                    st.write("A IA irá coletar e validar as informações presentes...")
+
+                    # Text extraction and embedding using OCR and LLM to build a QA RAG
+                    query = prompts[document].get('latest')['input']
+                    agent = RAG_agent_document_validator(document, uploaded_file)
+                    answer = agent.invoke({'input': query})['answer']
+
+                    stream_handler = StreamHandler(st.empty())
+                    for token in answer:
+                        stream_handler.on_llm_new_token(token=token)
+                    
+                    st.write("Dados da Minuta (parte compradora)")
+
+                    stream_handler = StreamHandler(st.empty())
+                    for token in st.session_state.minuta_comprador:
+                        stream_handler.on_llm_new_token(token=token)
 
 if __name__ == "__main__":
     main()

From c95ea88744e0fa85a982a8d5fcaa342654b5d222 Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Sun, 13 Apr 2025 11:39:45 -0300
Subject: [PATCH 08/24] feat: prompt to compare and validate data from two
 tables.

---
 multiple_files_bot.py => StartLegal.py | 143 ++++++++++++++++---------
 multiple_files_bot.Dockerfile          |   5 +-
 pages/1_Anexar_Minuta.py               |  21 ++++
 pages/2_Parte_Compradora.py            |  23 ++++
 pages/3_Parte_Vendedora.py             |  25 +++++
 5 files changed, 166 insertions(+), 51 deletions(-)
 rename multiple_files_bot.py => StartLegal.py (64%)
 create mode 100644 pages/1_Anexar_Minuta.py
 create mode 100644 pages/2_Parte_Compradora.py
 create mode 100644 pages/3_Parte_Vendedora.py

diff --git a/multiple_files_bot.py b/StartLegal.py
similarity index 64%
rename from multiple_files_bot.py
rename to StartLegal.py
index fd73decab..8cbaa5439 100644
--- a/multiple_files_bot.py
+++ b/StartLegal.py
@@ -5,12 +5,12 @@
 from pdf2image import convert_from_bytes
 from PIL import Image
 import pytesseract
-from langchain.chains import RetrievalQA
 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Neo4jVector
 from langchain_core.prompts import ChatPromptTemplate
 from langchain.chains import create_retrieval_chain
+from langchain_core.output_parsers import StrOutputParser
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from streamlit.logger import get_logger
 from chains import (
@@ -21,32 +21,41 @@
 # load api key lib
 from dotenv import load_dotenv
 
-load_dotenv(".env")
-
-
-url = os.getenv("NEO4J_URI")
-username = os.getenv("NEO4J_USERNAME")
-password = os.getenv("NEO4J_PASSWORD")
-ollama_base_url = os.getenv("OLLAMA_BASE_URL")
-embedding_model_name = os.getenv("EMBEDDING_MODEL")
-llm_name = os.getenv("LLM")
-# Remapping for Langchain Neo4j integration
-os.environ["NEO4J_URL"] = url
-
 logger = get_logger(__name__)
 
 
-embeddings, dimension = load_embedding_model(
-    embedding_model_name, config={"ollama_base_url": ollama_base_url}, logger=logger
-)
-
-
-prompts = dict()
-with open('prompts.json', 'rb') as f:
-    prompts = json.load(f)
-
+def init():
+    st.session_state.url = os.getenv("NEO4J_URI")
+    st.session_state.username = os.getenv("NEO4J_USERNAME")
+    st.session_state.password = os.getenv("NEO4J_PASSWORD")
+    
+    ollama_base_url = os.getenv("OLLAMA_BASE_URL")
+    embedding_model_name = os.getenv("EMBEDDING_MODEL")
+    llm_name = os.getenv("LLM")
+    # Remapping for Langchain Neo4j integration
+    os.environ["NEO4J_URL"] = st.session_state.url
+
+    embeddings, dimension = load_embedding_model(
+        embedding_model_name, 
+        config={"ollama_base_url": ollama_base_url}, 
+        logger=logger
+    )
+    st.session_state.embeddings = embeddings
+    st.session_state.dimension = dimension
+
+    prompts = dict()
+    with open('prompts.json', 'rb') as f:
+        prompts = json.load(f)
+    
+    st.session_state.prompts = prompts
+    st.session_state.llm = load_llm(
+        llm_name, 
+        logger=logger, 
+        config={"ollama_base_url": ollama_base_url}
+    )
 
-llm = load_llm(llm_name, logger=logger, config={"ollama_base_url": ollama_base_url})
+    st.session_state.documents_list = ['CNH', 'Comprovante de Residência', 'Certidão de Casamento']
+    st.session_state.documents = []
 
 
 class StreamHandler(BaseCallbackHandler):
@@ -89,10 +98,10 @@ def RAG_Minuta(document_name: str, uploaded_file):
         # Store the chuncks part in db (vector)
         vectorstore = Neo4jVector.from_texts(
             chunks,
-            url=url,
-            username=username,
-            password=password,
-            embedding=embeddings,
+            url=st.session_state.url,
+            username=st.session_state.username,
+            password=st.session_state.password,
+            embedding=st.session_state.embeddings,
             node_label=f"MultipleFilesBotChunk_{document_name}",
             pre_delete_collection=True, # Delete existing data in collection
         )
@@ -135,10 +144,10 @@ def RAG_agent_document_validator(document_name: str, uploaded_file):
         # Store the chuncks part in db (vector)
         vectorstore = Neo4jVector.from_texts(
             chunks,
-            url=url,
-            username=username,
-            password=password,
-            embedding=embeddings,
+            url=st.session_state.url,
+            username=st.session_state.username,
+            password=st.session_state.password,
+            embedding=st.session_state.embeddings,
             node_label=f"MultipleFilesBotChunk_{document_name}",
             pre_delete_collection=True, # Delete existing data in collection
         )
@@ -150,7 +159,7 @@ def RAG_agent_document_validator(document_name: str, uploaded_file):
 
 def build_RAG_agent(document_name, vectorstore, prompt=None, history_context=""):
     if not prompt:
-        prompt = prompts[document_name].get('latest')['prompt']
+        prompt = st.session_state.prompts[document_name].get('latest')['prompt']
 
     system_prompt = prompt + " Context: {context} " + history_context + " "
     prompt = ChatPromptTemplate(
@@ -159,33 +168,37 @@ def build_RAG_agent(document_name, vectorstore, prompt=None, history_context="")
                 ("human", "{input}")
             ]
         )
-    qa_chain = create_stuff_documents_chain(llm, prompt)
+    qa_chain = create_stuff_documents_chain(st.session_state.llm, prompt)
     agent_document_retreiver = create_retrieval_chain(vectorstore.as_retriever(), qa_chain)
     return agent_document_retreiver
 
 
 agents = dict()
-documents_list = ['CNH', 'Comprovante de Residência', 'Certidão de Casamento']
-if 'documents' not in st.session_state:
-    st.session_state.documents = []
+
 
 def main():
-    st.title("📄 StartLegal - Agente Revisor de Minutas")
+    if 'init' not in st.session_state:
+        st.session_state.init = True
+        load_dotenv(".env")
+        init()
+
+    st.set_page_config(page_title="StartLegal")
 
-    st.subheader("Anexe a minuta de uma escritura e em seguida os documentos necessários para revisão.")
+    st.header("StartLegal - Módulo Revisor 📄", divider='gray')
+
+    st.subheader(
+        "Anexe a minuta de uma escritura e em seguida os documentos necessários para revisão."
+    )
 
     with st.sidebar:
         st.title("Partes Envolvidas")
-
-        st.subheader("Parte Compradora")
         st.text("Documentos apresentados:")
         for doc in st.session_state.documents:
             st.text(doc)
 
-        st.subheader("Parte Vendedora")
         st.text("Documentos apresentados...")
 
-     # upload a your files
+    # upload a your files
     uploaded_file_minuta = st.file_uploader(
         "Suba o documento da Minuta em formato PDF.", 
         accept_multiple_files=False,
@@ -204,27 +217,27 @@ def main():
 
                 # Print a table with Minuta information
                 if 'minuta_db' in st.session_state:
-                    minuta_system = prompts['Minuta Comprador'].get('latest').get('prompt_minuta', None)
+                    minuta_system = st.session_state.prompts['Minuta Comprador'].get('latest').get('prompt_minuta', None)
                     minuta_agent = build_RAG_agent('Minuta', st.session_state.minuta_db, minuta_system) # " gere uma tabela juntando a coluna 'Valor' das tabelas existentes."
                     
-                    query = prompts['Minuta Comprador'].get('latest')['input_minuta']
+                    query = st.session_state.prompts['Minuta Comprador'].get('latest')['input_minuta']
                     minuta_response = minuta_agent.invoke({'input': query })
                     answer = minuta_response['answer']
                     st.session_state.minuta_comprador = answer
 
-                    minuta_system = prompts['Minuta Vendedor'].get('latest').get('prompt_minuta', None)
+                    minuta_system = st.session_state.prompts['Minuta Vendedor'].get('latest').get('prompt_minuta', None)
                     minuta_agent = build_RAG_agent('Minuta', st.session_state.minuta_db, minuta_system) # " gere uma tabela juntando a coluna 'Valor' das tabelas existentes."
                     
-                    query = prompts['Minuta Vendedor'].get('latest')['input_minuta']
+                    query = st.session_state.prompts['Minuta Vendedor'].get('latest')['input_minuta']
                     minuta_response = minuta_agent.invoke({'input': query })
                     answer = minuta_response['answer']
                     st.session_state.minuta_vendedor = answer
 
     # Activate tabs after Minuta has been processed...
     if st.session_state.minuta:
-        tabs = st.tabs(documents_list)
+        tabs = st.tabs(st.session_state.documents_list)
 
-        for tab, document in zip(tabs, documents_list):
+        for tab, document in zip(tabs, st.session_state.documents_list):
             with tab:
                 # upload a your files
                 uploaded_file = st.file_uploader(
@@ -238,7 +251,7 @@ def main():
                     st.write("A IA irá coletar e validar as informações presentes...")
 
                     # Text extraction and embedding using OCR and LLM to build a QA RAG
-                    query = prompts[document].get('latest')['input']
+                    query = st.session_state.prompts[document].get('latest')['input']
                     agent = RAG_agent_document_validator(document, uploaded_file)
                     answer = agent.invoke({'input': query})['answer']
 
@@ -246,11 +259,43 @@ def main():
                     for token in answer:
                         stream_handler.on_llm_new_token(token=token)
                     
+                    # Visualize data from Minuta document
                     st.write("Dados da Minuta (parte compradora)")
 
                     stream_handler = StreamHandler(st.empty())
                     for token in st.session_state.minuta_comprador:
                         stream_handler.on_llm_new_token(token=token)
 
+                    # Ask to LLM a table showing the Document data and Minuta data
+                    st.write(f"Validando de {document} com os dados da Minuta.")
+                    
+                    context = "Primeira tabela " + \
+                            answer + "| Segunda tabela " + \
+                            st.session_state.minuta_comprador
+
+                    system_prompt = """ 
+                    Dentro do contexto existem duas tabelas. 
+                    Compare os dados em comum e responda ao usuário através de uma 
+                    tabela com uma coluna indicando se existe diferença de escrita 
+                    entre a primeira e segunda tabelas. 
+                    Ignore diferenças causadas por Letras Maiúsculas e Minúsculas, 
+                    ou símbolos '.', '-', ou '/'.
+                    """ + f" Contexto: {context} "
+                    prompt = ChatPromptTemplate(
+                            [
+                                ("system", system_prompt),
+                                ("human", "{input}")
+                            ]
+                        )
+                    
+                    chain = prompt | st.session_state.llm | StrOutputParser()
+
+                    final_answer = chain.invoke("Valide os dados do {document} com os dados da Minuta.")
+
+                    stream_handler = StreamHandler(st.empty())
+                    for token in final_answer:
+                        stream_handler.on_llm_new_token(token=token)
+
+
 if __name__ == "__main__":
     main()
diff --git a/multiple_files_bot.Dockerfile b/multiple_files_bot.Dockerfile
index ad050dc91..7142d2703 100644
--- a/multiple_files_bot.Dockerfile
+++ b/multiple_files_bot.Dockerfile
@@ -18,7 +18,8 @@ COPY requirements.txt .
 
 RUN pip install --upgrade -r requirements.txt
 
-COPY multiple_files_bot.py .
+ADD pages pages
+COPY StartLegal.py .
 COPY prompts.json .
 COPY utils.py .
 COPY chains.py .
@@ -27,4 +28,4 @@ EXPOSE 8505
 
 HEALTHCHECK CMD curl --fail http://localhost:8503/_stcore/health
 
-ENTRYPOINT ["streamlit", "run", "multiple_files_bot.py", "--server.port=8505", "--server.address=0.0.0.0"]
+ENTRYPOINT ["streamlit", "run", "StartLegal.py", "--server.port=8505", "--server.address=0.0.0.0"]
diff --git a/pages/1_Anexar_Minuta.py b/pages/1_Anexar_Minuta.py
new file mode 100644
index 000000000..753552e12
--- /dev/null
+++ b/pages/1_Anexar_Minuta.py
@@ -0,0 +1,21 @@
+import streamlit as st
+
+st.set_page_config(page_title="StartLegal - Anexar a Minuta")
+
+st.sidebar.header("Dados obtidos da Minuta")
+
+st.subheader(
+    "Anexe a minuta da escritura para iniciar a revisão.",
+    divider='gray'
+)
+
+# upload a your files
+uploaded_file_minuta = st.file_uploader(
+    "Suba o documento da Minuta em formato PDF.", 
+    accept_multiple_files=False,
+    type="pdf",
+    key='minuta'
+)
+
+if uploaded_file_minuta:
+    st.write('Minuta Anexada!')
\ No newline at end of file
diff --git a/pages/2_Parte_Compradora.py b/pages/2_Parte_Compradora.py
new file mode 100644
index 000000000..0e9b5d3fb
--- /dev/null
+++ b/pages/2_Parte_Compradora.py
@@ -0,0 +1,23 @@
+import streamlit as st
+
+st.session_state.buyer_documents_list = [
+    'CNH', 
+    'Comprovante de Residência', 
+    'Certidão de Casamento'
+]
+
+# Define a list of Documents at app init() method
+tabs = st.tabs(st.session_state.buyer_documents_list)
+
+for tab, document in zip(tabs, st.session_state.buyer_documents_list):
+    with tab:
+        # upload a your files
+        uploaded_file = st.file_uploader(
+            "Suba o documento em algum desses formatos: PDF, png, jpeg, ou txt.", 
+            accept_multiple_files=False,
+            type=["png", "jpg", "jpeg", "pdf", "txt"],
+            key=document
+        )
+    
+    if uploaded_file:
+        st.write("A IA irá coletar e validar as informações presentes...")
\ No newline at end of file
diff --git a/pages/3_Parte_Vendedora.py b/pages/3_Parte_Vendedora.py
new file mode 100644
index 000000000..b90aaadba
--- /dev/null
+++ b/pages/3_Parte_Vendedora.py
@@ -0,0 +1,25 @@
+import streamlit as st
+
+st.session_state.owner_documents_list = [
+    'CNH', 
+    'Comprovante de Residência', 
+    'Certidão de Casamento'
+]
+
+
+
+# Define a list of Documents at app init() method
+tabs = st.tabs(st.session_state.owner_documents_list)
+
+for tab, document in zip(tabs, st.session_state.owner_documents_list):
+    with tab:
+        # upload a your files
+        uploaded_file = st.file_uploader(
+            "Suba o documento em algum desses formatos: PDF, png, jpeg, ou txt.", 
+            accept_multiple_files=False,
+            type=["png", "jpg", "jpeg", "pdf", "txt"],
+            key=document
+        )
+    
+    if uploaded_file:
+        st.write("A IA irá coletar e validar as informações presentes...")
\ No newline at end of file

From df0b7eb7a49fcb66167bfc19e1aef7d0eea9be5e Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Wed, 16 Apr 2025 16:38:39 -0300
Subject: [PATCH 09/24] refactor: add package for rag systems and split steps on
 3 pages.

---
 StartLegal.py                   | 301 ++++++++------------------------
 __init__.py                     |   0
 multiple_files_bot.Dockerfile   |   2 +
 pages/1_Anexar_Minuta.py        |  75 +++++++-
 pages/2_Parte_Compradora.py     | 107 +++++++++++-
 prompts.json                    |   4 +-
 rag_utils/__init__.py           |   0
 rag_utils/config.py             |  52 ++++++
 rag_utils/content_indexing.py   | 127 ++++++++++++++
 rag_utils/document_retrieval.py |  31 ++++
 10 files changed, 465 insertions(+), 234 deletions(-)
 create mode 100644 __init__.py
 create mode 100644 rag_utils/__init__.py
 create mode 100644 rag_utils/config.py
 create mode 100644 rag_utils/content_indexing.py
 create mode 100644 rag_utils/document_retrieval.py

diff --git a/StartLegal.py b/StartLegal.py
index 8cbaa5439..d5dc9a451 100644
--- a/StartLegal.py
+++ b/StartLegal.py
@@ -21,19 +21,21 @@
 # load api key lib
 from dotenv import load_dotenv
 
+
 logger = get_logger(__name__)
 
 
 def init():
-    st.session_state.url = os.getenv("NEO4J_URI")
-    st.session_state.username = os.getenv("NEO4J_USERNAME")
-    st.session_state.password = os.getenv("NEO4J_PASSWORD")
+    st.session_state.vectorstore_config = dict()
+    st.session_state.vectorstore_config['url'] = os.getenv("NEO4J_URI")
+    st.session_state.vectorstore_config['username'] = os.getenv("NEO4J_USERNAME")
+    st.session_state.vectorstore_config['password'] = os.getenv("NEO4J_PASSWORD")
     
     ollama_base_url = os.getenv("OLLAMA_BASE_URL")
     embedding_model_name = os.getenv("EMBEDDING_MODEL")
     llm_name = os.getenv("LLM")
     # Remapping for Langchain Neo4j integration
-    os.environ["NEO4J_URL"] = st.session_state.url
+    os.environ["NEO4J_URL"] = st.session_state.vectorstore_config['url']
 
     embeddings, dimension = load_embedding_model(
         embedding_model_name, 
@@ -68,234 +70,85 @@ def on_llm_new_token(self, token: str, **kwargs) -> None:
         self.container.markdown(self.text)
 
 
-def RAG_Minuta(document_name: str, uploaded_file):
-    text = ""
-    if uploaded_file:
-        bytes_data = uploaded_file.read()
-        file_format = uploaded_file.name.split('.')[1].lower()
-        
-        match file_format:
-            case 'pdf':
-                images = convert_from_bytes(bytes_data)
-                # Somente a 1ª página
-                text += pytesseract.image_to_string(images[0], lang='por') + " \n\n"
-
-                # for i, image in enumerate(images):
-                #     text += f"Página: {i} \n\n" + pytesseract.image_to_string(image, lang='por')
-            case _:
-                st.write("Formato do arquivo:", uploaded_file.name, "não é suportado!")
-
-        # langchain_textspliter
-        text_splitter = RecursiveCharacterTextSplitter(
-                            chunk_size=10000,
-                            chunk_overlap=200,
-                            length_function=len, 
-                            separators=['\n\n', '\n']
-                        )
-        chunks = text_splitter.split_text(text=text)
-        chunks = [f"NOME_DO_DOCUMENTO: {document_name} " + chunk for chunk in chunks]
-
-        # Store the chuncks part in db (vector)
-        vectorstore = Neo4jVector.from_texts(
-            chunks,
-            url=st.session_state.url,
-            username=st.session_state.username,
-            password=st.session_state.password,
-            embedding=st.session_state.embeddings,
-            node_label=f"MultipleFilesBotChunk_{document_name}",
-            pre_delete_collection=True, # Delete existing data in collection
-        )
-
-        return vectorstore
-
-
-def RAG_agent_document_validator(document_name: str, uploaded_file):
-    text = ""
-    if uploaded_file:
-        bytes_data = uploaded_file.read()
-        file_format = uploaded_file.name.split('.')[1].lower()
-        
-        match file_format:
-            case 'pdf':
-                images = convert_from_bytes(bytes_data)
-                for i, image in enumerate(images):
-                    text += f"Página: {i} \n\n" + pytesseract.image_to_string(image, lang='por')
-            case 'txt':
-                for line in uploaded_file:
-                    text += line
-            case 'png':
-                text += pytesseract.image_to_string(Image.open(uploaded_file), lang='por')
-            case 'jpg':
-                text += pytesseract.image_to_string(Image.open(uploaded_file), lang='por')
-            case 'jpeg':
-                text += pytesseract.image_to_string(Image.open(uploaded_file), lang='por')
-            case _:
-                st.write("Formato do arquivo:", uploaded_file.name, "não é suportado!")
-
-        # langchain_textspliter
-        text_splitter = RecursiveCharacterTextSplitter(
-                            chunk_size=10000,
-                            chunk_overlap=200,
-                            length_function=len
-                        )
-        chunks = text_splitter.split_text(text=text)
-        chunks = [f"NOME_DO_DOCUMENTO: {document_name} " + chunk for chunk in chunks]
-
-        # Store the chuncks part in db (vector)
-        vectorstore = Neo4jVector.from_texts(
-            chunks,
-            url=st.session_state.url,
-            username=st.session_state.username,
-            password=st.session_state.password,
-            embedding=st.session_state.embeddings,
-            node_label=f"MultipleFilesBotChunk_{document_name}",
-            pre_delete_collection=True, # Delete existing data in collection
-        )
-
-        agent_document_retreiver = build_RAG_agent(document_name, vectorstore)
-
-        return agent_document_retreiver
-
-
-def build_RAG_agent(document_name, vectorstore, prompt=None, history_context=""):
-    if not prompt:
-        prompt = st.session_state.prompts[document_name].get('latest')['prompt']
-
-    system_prompt = prompt + " Context: {context} " + history_context + " "
-    prompt = ChatPromptTemplate(
-            [
-                ("system", system_prompt),
-                ("human", "{input}")
-            ]
-        )
-    qa_chain = create_stuff_documents_chain(st.session_state.llm, prompt)
-    agent_document_retreiver = create_retrieval_chain(vectorstore.as_retriever(), qa_chain)
-    return agent_document_retreiver
-
-
 agents = dict()
 
+if 'init' not in st.session_state:
+    st.session_state.init = True
+    load_dotenv(".env")
+    init()
 
-def main():
-    if 'init' not in st.session_state:
-        st.session_state.init = True
-        load_dotenv(".env")
-        init()
+st.set_page_config(page_title="StartLegal")
 
-    st.set_page_config(page_title="StartLegal")
+st.header("StartLegal - Módulo Revisor 📄", divider='gray')
 
-    st.header("StartLegal - Módulo Revisor 📄", divider='gray')
-
-    st.subheader(
-        "Anexe a minuta de uma escritura e em seguida os documentos necessários para revisão."
-    )
-
-    with st.sidebar:
-        st.title("Partes Envolvidas")
-        st.text("Documentos apresentados:")
-        for doc in st.session_state.documents:
-            st.text(doc)
-
-        st.text("Documentos apresentados...")
-
-    # upload a your files
-    uploaded_file_minuta = st.file_uploader(
-        "Suba o documento da Minuta em formato PDF.", 
-        accept_multiple_files=False,
-        type="pdf",
-        key='minuta'
-    )
+st.subheader(
+    "Anexe a minuta de uma escritura e em seguida os documentos necessários para revisão."
+)
 
-    if uploaded_file_minuta:
-        if not st.session_state.get('rag_minuta', False):   
-            st.write("A IA irá coletar as informações presentes no documento...")
+with st.sidebar:
+    st.title("Etapas de Revisão")
+#     st.text("Documentos apresentados:")
+#     for doc in st.session_state.documents:
+#         st.text(doc)
+
+#     st.text("Documentos apresentados...")
+
+# tabs = st.tabs(st.session_state.documents_list)
+
+# for tab, document in zip(tabs, st.session_state.documents_list):
+#     with tab:
+#         # upload a your files
+#         uploaded_file = st.file_uploader(
+#             "Suba o documento em algum desses formatos: PDF, png, jpeg, ou txt.", 
+#             accept_multiple_files=False,
+#             type=["png", "jpg", "jpeg", "pdf", "txt"],
+#             key=document
+#         )
+
+#         if uploaded_file:
+#             st.write("A IA irá coletar e validar as informações presentes...")
+
+#             # Text extraction and embedding using OCR and LLM to build a QA RAG
+#             query = st.session_state.prompts[document].get('latest')['input']
+#             agent = RAG_agent_document_validator(document, uploaded_file)
+#             answer = agent.invoke({'input': query})['answer']
+
+#             stream_handler = StreamHandler(st.empty())
+#             for token in answer:
+#                 stream_handler.on_llm_new_token(token=token)
             
-            minuta_vector_db = RAG_Minuta('Minuta', uploaded_file_minuta)
-            if 'rag_minuta' not in st.session_state:
-                st.session_state['rag_minuta'] = True
-                st.session_state.minuta_db = minuta_vector_db
-
-                # Print a table with Minuta information
-                if 'minuta_db' in st.session_state:
-                    minuta_system = st.session_state.prompts['Minuta Comprador'].get('latest').get('prompt_minuta', None)
-                    minuta_agent = build_RAG_agent('Minuta', st.session_state.minuta_db, minuta_system) # " gere uma tabela juntando a coluna 'Valor' das tabelas existentes."
-                    
-                    query = st.session_state.prompts['Minuta Comprador'].get('latest')['input_minuta']
-                    minuta_response = minuta_agent.invoke({'input': query })
-                    answer = minuta_response['answer']
-                    st.session_state.minuta_comprador = answer
-
-                    minuta_system = st.session_state.prompts['Minuta Vendedor'].get('latest').get('prompt_minuta', None)
-                    minuta_agent = build_RAG_agent('Minuta', st.session_state.minuta_db, minuta_system) # " gere uma tabela juntando a coluna 'Valor' das tabelas existentes."
-                    
-                    query = st.session_state.prompts['Minuta Vendedor'].get('latest')['input_minuta']
-                    minuta_response = minuta_agent.invoke({'input': query })
-                    answer = minuta_response['answer']
-                    st.session_state.minuta_vendedor = answer
-
-    # Activate tabs after Minuta has been processed...
-    if st.session_state.minuta:
-        tabs = st.tabs(st.session_state.documents_list)
-
-        for tab, document in zip(tabs, st.session_state.documents_list):
-            with tab:
-                # upload a your files
-                uploaded_file = st.file_uploader(
-                    "Suba o documento em algum desses formatos: PDF, png, jpeg, ou txt.", 
-                    accept_multiple_files=False,
-                    type=["png", "jpg", "jpeg", "pdf", "txt"],
-                    key=document
-                )
+#             # Visualize data from Minuta document
+#             st.write("Dados da Minuta (parte compradora)")
 
-                if uploaded_file:
-                    st.write("A IA irá coletar e validar as informações presentes...")
+#             stream_handler = StreamHandler(st.empty())
+#             for token in st.session_state.minuta_comprador:
+#                 stream_handler.on_llm_new_token(token=token)
 
-                    # Text extraction and embedding using OCR and LLM to build a QA RAG
-                    query = st.session_state.prompts[document].get('latest')['input']
-                    agent = RAG_agent_document_validator(document, uploaded_file)
-                    answer = agent.invoke({'input': query})['answer']
-
-                    stream_handler = StreamHandler(st.empty())
-                    for token in answer:
-                        stream_handler.on_llm_new_token(token=token)
-                    
-                    # Visualize data from Minuta document
-                    st.write("Dados da Minuta (parte compradora)")
-
-                    stream_handler = StreamHandler(st.empty())
-                    for token in st.session_state.minuta_comprador:
-                        stream_handler.on_llm_new_token(token=token)
-
-                    # Ask to LLM a table showing the Document data and Minuta data
-                    st.write(f"Validando de {document} com os dados da Minuta.")
-                    
-                    context = "Primeira tabela " + \
-                            answer + "| Segunda tabela " + \
-                            st.session_state.minuta_comprador
-
-                    system_prompt = """ 
-                    Dentro do contexto existem duas tabelas. 
-                    Compare os dados em comum e responda ao usuário através de uma 
-                    tabela com uma coluna indicando se existe diferença de escrita 
-                    entre a primeira e segunda tabelas. 
-                    Ignore diferenças causadas por Letras Maiúsculas e Minúsculas, 
-                    ou símbolos '.', '-', ou '/'.
-                    """ + f" Contexto: {context} "
-                    prompt = ChatPromptTemplate(
-                            [
-                                ("system", system_prompt),
-                                ("human", "{input}")
-                            ]
-                        )
-                    
-                    chain = prompt | st.session_state.llm | StrOutputParser()
-
-                    final_answer = chain.invoke("Valide os dados do {document} com os dados da Minuta.")
-
-                    stream_handler = StreamHandler(st.empty())
-                    for token in final_answer:
-                        stream_handler.on_llm_new_token(token=token)
+#             # Ask to LLM a table showing the Document data and Minuta data
+#             st.write(f"Validando de {document} com os dados da Minuta.")
+            
+#             context = "Primeira tabela " + \
+#                     answer + "| Segunda tabela " + \
+#                     st.session_state.minuta_comprador
+
+#             system_prompt = """ 
+#             Você é um assistente que revisa documentos e precisa auxiliar o usuário que faz o trabalho manual 
+#             de checar se dados que foram escritos na Minuta estão escritos da mesma forma que nos documentos de origem. 
+#             O usuário fornecerá duas tabelas após o termo 'Contexto'.
+#             Responda gerando uma tabela que compara apenas os dados dessas duas tabelas fornecidas.
+#             Ignore diferenças de letras maiúsculas e minúsculas, ou que tenham símbolos '.', '-', ou '/'. 
+#             """ + f" Contexto: {context} "
+#             prompt = ChatPromptTemplate(
+#                     [
+#                         ("system", system_prompt),
+#                         ("human", "{input}")
+#                     ]
+#                 )
+            
+#             chain = prompt | st.session_state.llm | StrOutputParser()
 
+#             final_answer = chain.invoke("Compare apenas os dados do {document} os quais também estejam presentes na Minuta.")
 
-if __name__ == "__main__":
-    main()
+#             stream_handler = StreamHandler(st.empty())
+#             for token in final_answer:
+#                 stream_handler.on_llm_new_token(token=token)
\ No newline at end of file
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/multiple_files_bot.Dockerfile b/multiple_files_bot.Dockerfile
index 7142d2703..360ea3f7e 100644
--- a/multiple_files_bot.Dockerfile
+++ b/multiple_files_bot.Dockerfile
@@ -18,7 +18,9 @@ COPY requirements.txt .
 
 RUN pip install --upgrade -r requirements.txt
 
+ADD rag_utils rag_utils
 ADD pages pages
+COPY __init__.py .
 COPY StartLegal.py .
 COPY prompts.json .
 COPY utils.py .
diff --git a/pages/1_Anexar_Minuta.py b/pages/1_Anexar_Minuta.py
index 753552e12..74eca9a69 100644
--- a/pages/1_Anexar_Minuta.py
+++ b/pages/1_Anexar_Minuta.py
@@ -1,5 +1,18 @@
 import streamlit as st
+import logging
+from streamlit.logger import get_logger
+from rag_utils.config import init
+from rag_utils.content_indexing import document_encoder_retriever
+from rag_utils.document_retrieval import build_agent
 
+logging.basicConfig(level = logging.INFO)
+
+logger = get_logger(__name__)
+
+if 'init' not in st.session_state:
+    st.session_state.init = True
+    init()
+    
 st.set_page_config(page_title="StartLegal - Anexar a Minuta")
 
 st.sidebar.header("Dados obtidos da Minuta")
@@ -18,4 +31,64 @@
 )
 
 if uploaded_file_minuta:
-    st.write('Minuta Anexada!')
\ No newline at end of file
+
+    if 'rag_minuta' not in st.session_state:
+        st.write("A IA irá coletar as informações presentes no documento...")
+        st.session_state['rag_minuta'] = True
+
+        minuta_retriever = document_encoder_retriever(
+            document_name='Minuta', 
+            uploaded_file=uploaded_file_minuta,
+            ocr_params={
+                'pages': [0],
+                'lang': 'por'
+            }, 
+            logger=logger, 
+            embeddings=st.session_state.embeddings,
+            vectorstore_config=st.session_state.vectorstore_config
+        )
+
+        st.session_state.minuta_db = minuta_retriever
+
+        minuta_system = st.session_state.prompts['Minuta Comprador'].get('latest').get('prompt_minuta', None)
+        
+        minuta_agent = build_agent(
+            prompt=minuta_system, 
+            vectorstore=minuta_retriever,
+            logger=logger,
+            history_context="",
+            llm=st.session_state.llm
+        )
+
+        query = st.session_state.prompts['Minuta Comprador'].get('latest')['input_minuta']
+        logger.info(f"{query}")
+        
+        minuta_response = minuta_agent.invoke({'input': query })
+        
+        answer = minuta_response['answer']
+        
+        st.session_state.minuta_comprador = answer
+
+        minuta_system_owner = st.session_state.prompts['Minuta Vendedor'].get('latest').get('prompt_minuta', None)
+        
+        minuta_agent_owner = build_agent(
+            prompt=minuta_system_owner, 
+            vectorstore=minuta_retriever,
+            logger=logger,
+            history_context="",
+            llm=st.session_state.llm
+        )
+
+        query_owner = st.session_state.prompts['Minuta Vendedor'].get('latest')['input_minuta']
+        
+        minuta_response_owner = minuta_agent_owner.invoke({'input': query_owner })
+        
+        answer_owner = minuta_response_owner['answer']
+        
+        st.session_state.minuta_vendedor = answer_owner
+
+if 'minuta_comprador' in st.session_state:
+    st.write(st.session_state.minuta_comprador)
+
+if 'minuta_vendedor' in st.session_state:
+    st.write(st.session_state.minuta_vendedor)
\ No newline at end of file
diff --git a/pages/2_Parte_Compradora.py b/pages/2_Parte_Compradora.py
index 0e9b5d3fb..82315c54c 100644
--- a/pages/2_Parte_Compradora.py
+++ b/pages/2_Parte_Compradora.py
@@ -1,10 +1,42 @@
 import streamlit as st
+from streamlit.logger import get_logger
+import logging
+from langchain.callbacks.base import BaseCallbackHandler
+from rag_utils.config import init
+from rag_utils.content_indexing import document_encoder_retriever
+from rag_utils.document_retrieval import build_agent
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
 
-st.session_state.buyer_documents_list = [
-    'CNH', 
-    'Comprovante de Residência', 
-    'Certidão de Casamento'
-]
+
+logging.basicConfig(level = logging.INFO)
+
+
+class StreamHandler(BaseCallbackHandler):
+    def __init__(self, container, initial_text=""):
+        self.container = container
+        self.text = initial_text
+
+    def on_llm_new_token(self, token: str, **kwargs) -> None:
+        self.text += token
+        self.container.markdown(self.text)
+
+
+if 'init' not in st.session_state:
+    st.session_state.init = True
+    init()
+
+if 'init_buyer_review_page' not in st.session_state:
+    st.session_state.init_buyer_review_page = True
+
+    st.session_state.buyer_documents_list = [
+        'CNH', 
+        'Comprovante de Residência', 
+        'Certidão de Casamento'
+    ]
+    st.session_state.final_answer = dict().fromkeys(st.session_state.buyer_documents_list)
+
+logger = get_logger(__name__)
 
 # Define a list of Documents at app init() method
 tabs = st.tabs(st.session_state.buyer_documents_list)
@@ -19,5 +51,66 @@
             key=document
         )
     
-    if uploaded_file:
-        st.write("A IA irá coletar e validar as informações presentes...")
\ No newline at end of file
+        if uploaded_file:
+            st.write("A IA irá coletar e validar as informações presentes...")
+
+            # Text extraction and embedding using OCR and LLM to build a QA RAG
+            document_retriever = document_encoder_retriever(
+                document_name=document, 
+                uploaded_file=uploaded_file,
+                ocr_params={
+                    'pages': None, # All pages
+                    'lang': 'por'
+                }, 
+                logger=logger, 
+                embeddings=st.session_state.embeddings,
+                vectorstore_config=st.session_state.vectorstore_config
+            )
+
+            # prepare prompt with instructions
+            instructions = st.session_state.prompts[document].get('latest')['prompt']
+            agent = build_agent(
+                prompt=instructions, 
+                vectorstore=document_retriever, 
+                logger=logger, 
+                llm=st.session_state.llm
+            )
+
+            query = st.session_state.prompts[document].get('latest')['input']
+            answer = agent.invoke({'input': query})['answer']
+            stream_handler = StreamHandler(st.empty())
+            for token in answer:
+                stream_handler.on_llm_new_token(token=token)
+
+            # Ask to LLM a table showing the Document data and Minuta data
+            st.write(f"Validando de {document} com os dados da Minuta.")
+            
+            context = "Primeira tabela " + \
+                    answer + "| Segunda tabela " + \
+                    st.session_state.minuta_comprador
+
+            system_prompt = """ 
+            Você é um assistente que revisa documentos e precisa auxiliar o usuário que faz o trabalho manual 
+            de checar se dados que foram escritos na Minuta estão escritos da mesma forma que nos documentos de origem. 
+            O usuário fornecerá duas tabelas após o termo 'Contexto'.
+            Responda gerando uma tabela que compara apenas os dados dessas duas tabelas fornecidas.
+            Ignore diferenças de letras maiúsculas e minúsculas, ou que tenham símbolos '.', '-', ou '/'. 
+            """ + f" Contexto: {context} "
+            prompt = ChatPromptTemplate(
+                    [
+                        ("system", system_prompt),
+                        ("human", "{input}")
+                    ]
+                )
+            
+            chain = prompt | st.session_state.llm | StrOutputParser()
+
+            final_answer = chain.invoke("Compare apenas os dados do {document} os quais também estejam presentes na Minuta.")
+            st.session_state.final_answer[document] = final_answer
+
+            stream_handler = StreamHandler(st.empty())
+            for token in final_answer:
+                stream_handler.on_llm_new_token(token=token)
+        else:
+            if st.session_state.final_answer[document]:
+                st.write(st.session_state.final_answer[document])
\ No newline at end of file
diff --git a/prompts.json b/prompts.json
index 0c79a8c01..a751cf982 100644
--- a/prompts.json
+++ b/prompts.json
@@ -2,7 +2,7 @@
     "Minuta Comprador": {
         "latest": {
             "prompt_minuta": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta'. Always simplify the answer returning a structured table with information summary.",
-            "input_minuta": "Extraia todos os dados pessoais da parte compradora (inclua nome completo) escritos no parágrafo que contém o termo 'Outorgada Compradora'. Crie uma tabela com 2 colunas: Dado e Valor."
+            "input_minuta": "Extraia todos os dados pessoais (inclua nome completo) e de endereço da parte compradora. Esses dados estão escritos no parágrafo que contém o termo 'Outorgada Compradora'. Crie uma tabela com 2 colunas: Dado e Valor."
         }
     },
 
@@ -35,7 +35,7 @@
 
         "latest": {
             "prompt":"You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Comprovante de Residência'. Always return a structured table consolidating the information in the end of the answer.",
-            "input": "Extraia os dados relacionados a endereço e CEP (Caixa Postal/ZIP Code). Apresente essas informações por extenso sem as abreviações de avenida, rua e apartamento/lote. Verifique se a data de envio do documento possui até 30 dias de diferença da data atual: 25/02/2025 (em caso negativo, retorne DOCUMENTO INVÁLIDO)." 
+            "input": "Extraia os dados relacionados a endereço e CEP (Caixa Postal/ZIP Code). Gere uma tabela as informações a seguir (por linha): endereço, cidade, estado, país. Verifique se a data de envio do documento possui até 30 dias de diferença da data atual: 25/02/2025 (em caso negativo, retorne DOCUMENTO INVÁLIDO)." 
         },
 
         "resposta": "Nome: WILSON PEREIRA DE LIMA; Endereço: Rua Setubal, 1245 - Apartamento 1402; CEP (Caixa Postal): 51130-010; Localização: Recife - PE."
diff --git a/rag_utils/__init__.py b/rag_utils/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/rag_utils/config.py b/rag_utils/config.py
new file mode 100644
index 000000000..6942f971f
--- /dev/null
+++ b/rag_utils/config.py
@@ -0,0 +1,52 @@
+import streamlit as st
+from streamlit.logger import get_logger
+import os
+import json
+
+from chains import (
+    load_embedding_model,
+    load_llm,
+)
+
+
+logger = get_logger(__name__)
+
+# load api key lib
+from dotenv import load_dotenv
+
+
+def init():
+    load_dotenv(".env")
+
+    st.session_state.vectorstore_config = dict()
+    st.session_state.vectorstore_config['url'] = os.getenv("NEO4J_URI")
+    st.session_state.vectorstore_config['username'] = os.getenv("NEO4J_USERNAME")
+    st.session_state.vectorstore_config['password'] = os.getenv("NEO4J_PASSWORD")
+    
+    ollama_base_url = os.getenv("OLLAMA_BASE_URL")
+    embedding_model_name = os.getenv("EMBEDDING_MODEL")
+    llm_name = os.getenv("LLM")
+    # Remapping for Langchain Neo4j integration
+    os.environ["NEO4J_URL"] = st.session_state.vectorstore_config['url']
+
+    embeddings, dimension = load_embedding_model(
+        embedding_model_name, 
+        config={"ollama_base_url": ollama_base_url}, 
+        logger=logger
+    )
+    st.session_state.embeddings = embeddings
+    st.session_state.dimension = dimension
+
+    prompts = dict()
+    with open('prompts.json', 'rb') as f:
+        prompts = json.load(f)
+    
+    st.session_state.prompts = prompts
+    st.session_state.llm = load_llm(
+        llm_name, 
+        logger=logger, 
+        config={"ollama_base_url": ollama_base_url}
+    )
+
+    st.session_state.documents_list = ['CNH', 'Comprovante de Residência', 'Certidão de Casamento']
+    st.session_state.documents = []
diff --git a/rag_utils/content_indexing.py b/rag_utils/content_indexing.py
new file mode 100644
index 000000000..642fd6e06
--- /dev/null
+++ b/rag_utils/content_indexing.py
@@ -0,0 +1,127 @@
+from pdf2image import convert_from_bytes
+from PIL import Image
+import pytesseract
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import Neo4jVector
+import logging
+
+
+def document_encoder_retriever(
+        document_name: str, 
+        uploaded_file,
+        ocr_params: dict,
+        logger: logging.Logger, 
+        vectorstore_config: dict, 
+        embeddings
+):
+    '''
+    Indexing Phase:
+        Documents are transformed into vector representations using dense embeddings.
+        These vectors are stored in a vector database.
+    '''
+
+    ocr_pages = ocr_params.get('pages', None)
+    ocr_lang = ocr_params.get('lang', None)
+    
+    if uploaded_file:
+        bytes_data = uploaded_file.read()
+        file_format = uploaded_file.name.split('.')[1].lower()
+        
+        # Extract text from document
+        if ocr_lang and type(ocr_lang) == str:
+            text = documents_OCR(
+                uploaded_file, 
+                logger, 
+                bytes_data, 
+                file_format, 
+                pages=ocr_pages, 
+                lang=ocr_lang
+            )    
+        else:
+            text = documents_OCR(uploaded_file, logger, bytes_data, file_format)
+        
+        # langchain_textspliter
+        chunks = text_chunking(document_name, text)
+
+        # Store the chuncks part in db (vector)
+        vectorstore = build_vectorstore(document_name, vectorstore_config, embeddings, chunks)
+
+        return vectorstore
+
+
+def documents_OCR(uploaded_file, logger, bytes_data, file_format, pages=None, lang='por'):
+    '''
+    OCR Step:
+        Extract text from PDFs, images and txt files. 
+    '''
+    text = ""
+
+    match file_format:
+        case 'pdf':
+            images = convert_from_bytes(bytes_data)
+
+            if not pages:
+                pages = list(range(len(images)))
+
+            for i, image in enumerate(images):
+                if i not in pages:
+                    continue
+
+                text += f"Página: {i} \n\n" + pytesseract.image_to_string(image, lang=lang)
+
+        case 'txt':
+            for line in uploaded_file:
+                text += line
+
+        case 'png':
+            text += pytesseract.image_to_string(Image.open(uploaded_file), lang=lang)
+
+        case 'jpg':
+            text += pytesseract.image_to_string(Image.open(uploaded_file), lang=lang)
+
+        case 'jpeg':
+            text += pytesseract.image_to_string(Image.open(uploaded_file), lang=lang)
+
+        case _:
+            logger.error(f"Formato do arquivo: {uploaded_file.name} não é suportado!")
+            
+    return text
+
+
+def text_chunking(document_name, text, size=10000, overlap=200, text_splitter=None):
+    '''
+    Chuncking Step:
+        Split document content into smaller segments called chunks. 
+        These can be paragraphs, sentences, or token-limited segments, making it easier for the model to search and retrieve only what's needed. 
+        The chunking technique is crucial for optimizing RAG performance.
+    '''
+    if not text_splitter:
+        text_splitter = RecursiveCharacterTextSplitter(
+                                chunk_size=size,
+                                chunk_overlap=overlap,
+                                length_function=len, 
+                                separators=['\n\n', '\n']
+                        )
+    
+    chunks = text_splitter.split_text(text=text)
+    chunks = [f"NOME_DO_DOCUMENTO: {document_name} " + chunk for chunk in chunks]
+    return chunks
+
+
+def build_vectorstore(reference_name, vectorstore_config, embeddings, chunks):
+    '''
+    Store Embeddings Step:
+        Enconding all chunks as dense embeddings representation and store them in a Vector Database.
+    '''
+    vectorstore = Neo4jVector.from_texts(
+            chunks,
+            url=vectorstore_config['url'],
+            username=vectorstore_config['username'],
+            password=vectorstore_config['password'],
+            embedding=embeddings,
+            node_label=f"MultipleFilesBotChunk_{reference_name}",
+            pre_delete_collection=True, # Delete existing data in collection
+        )
+    
+    return vectorstore
+
diff --git a/rag_utils/document_retrieval.py b/rag_utils/document_retrieval.py
new file mode 100644
index 000000000..9e89eb05f
--- /dev/null
+++ b/rag_utils/document_retrieval.py
@@ -0,0 +1,31 @@
+from langchain_core.prompts import ChatPromptTemplate
+from langchain.chains import create_retrieval_chain
+from langchain_core.output_parsers import StrOutputParser
+from langchain.chains.combine_documents import create_stuff_documents_chain
+import logging
+
+
+def build_agent(prompt, vectorstore, logger: logging.Logger, history_context="", llm=None):
+    if not llm:
+        logger.error("LLM is not available!")
+
+        return None
+    
+    if not prompt:
+        # st.session_state.prompts[document_name].get('latest')['prompt']
+        prompt = "You are a user assistant. Answer the questions using only the context provided." 
+
+    system_prompt = prompt + " Context: {context} " + history_context + " "
+
+    chat_prompt = ChatPromptTemplate(
+            [
+                ("system", system_prompt),
+                ("human", "{input}")
+            ]
+        )
+    
+    qa_chain = create_stuff_documents_chain(llm, chat_prompt)
+
+    agent_document_retrieval = create_retrieval_chain(vectorstore.as_retriever(), qa_chain)
+    
+    return agent_document_retrieval

From a89cf3821ecfcda43bd101a312fab497fd315a65 Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Tue, 22 Apr 2025 22:33:30 -0300
Subject: [PATCH 10/24] feat: RAG package for multiple documents content
 retrieval.

---
 rag_utils/content_indexing.py      |  2 +-
 rag_utils/pipeline.py              | 71 ++++++++++++++++++++++++++++++
 rag_utils/qa_document_retrieval.py | 31 +++++++++++++
 3 files changed, 103 insertions(+), 1 deletion(-)
 create mode 100644 rag_utils/pipeline.py
 create mode 100644 rag_utils/qa_document_retrieval.py

diff --git a/rag_utils/content_indexing.py b/rag_utils/content_indexing.py
index 642fd6e06..e8a1ad91b 100644
--- a/rag_utils/content_indexing.py
+++ b/rag_utils/content_indexing.py
@@ -24,7 +24,7 @@ def document_encoder_retriever(
     ocr_lang = ocr_params.get('lang', None)
     
     if uploaded_file:
-        bytes_data = uploaded_file.read()
+        bytes_data = uploaded_file.getvalue()
         file_format = uploaded_file.name.split('.')[1].lower()
         
         # Extract text from document
diff --git a/rag_utils/pipeline.py b/rag_utils/pipeline.py
new file mode 100644
index 000000000..1d6ebd42e
--- /dev/null
+++ b/rag_utils/pipeline.py
@@ -0,0 +1,71 @@
+from rag_utils.content_indexing import document_encoder_retriever
+from rag_utils.qa_document_retrieval import build_agent
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+
+
+def RAG_document_retrieval(
+    document, 
+    file,
+    prompts, 
+    logger, 
+    embeddings, 
+    vectordb_config,
+    llm,
+    ocr_params={'pages': None, 'lang': 'por'}
+) -> str:
+    # Text extraction and embedding using OCR and LLM to build a QA RAG
+    document_retriever = document_encoder_retriever(
+        document_name=document, 
+        uploaded_file=file,
+        ocr_params=ocr_params, 
+        logger=logger, 
+        embeddings=embeddings,
+        vectorstore_config=vectordb_config
+    )
+
+    # prepare prompt with instructions
+    instructions = prompts[document].get('latest')['prompt']
+    agent = build_agent(
+        prompt=instructions, 
+        vectorstore=document_retriever, 
+        logger=logger, 
+        llm=llm
+    )
+
+    # QA RAG document retrieval
+    query = prompts[document].get('latest')['input']
+    answer = agent.invoke({'input': query})['answer']
+    
+    return answer
+
+
+def RAG_document_validator(document, document_answer, minuta_answer, llm):
+    
+    # Build context aggregating information from document and Minuta
+    context = f"Tabela {document} " + \
+        document_answer + "| Tabela Minuta" + \
+        minuta_answer
+
+    # Instructions of how to check if Minuta information matches document information
+    system_prompt = """ 
+    Você é um assistente que compara dados obtidos de diferentes documentos. 
+    O usuário fornecerá duas tabelas após o termo 'Contexto'.
+    Auxilie o usuário a checar se os dados nessas duas tabelas estão escritos da mesma forma. 
+    A comparação dos dados em comum precisa estar numa tabela. 
+    Dados que aparecem em apenas uma das tabelas fornecidas não precisa aparecer na tabela de comparação.
+    A comparação pode ignorar diferenças entre letras maiúsculas e minúsculas, e a presença de símbolos '.', '-', ou '/'.
+    A tabela de comparação precisa ter uma coluna 'Validação' que indica se os dados foram escritos de forma idêntica. 
+    """ + f" Contexto: {context} "
+    prompt = ChatPromptTemplate(
+            [
+                ("system", system_prompt),
+                ("human", "{input}")
+            ]
+        )
+    
+    chain = prompt | llm | StrOutputParser()
+    
+    # QA RAG document validation
+    answer = chain.invoke(f"Compare apenas os dados do {document} os quais também estejam presentes na Minuta.")
+    return answer
\ No newline at end of file
diff --git a/rag_utils/qa_document_retrieval.py b/rag_utils/qa_document_retrieval.py
new file mode 100644
index 000000000..9e89eb05f
--- /dev/null
+++ b/rag_utils/qa_document_retrieval.py
@@ -0,0 +1,31 @@
+from langchain_core.prompts import ChatPromptTemplate
+from langchain.chains import create_retrieval_chain
+from langchain_core.output_parsers import StrOutputParser
+from langchain.chains.combine_documents import create_stuff_documents_chain
+import logging
+
+
+def build_agent(prompt, vectorstore, logger: logging.Logger, history_context="", llm=None):
+    if not llm:
+        logger.error("LLM is not available!")
+
+        return None
+    
+    if not prompt:
+        # st.session_state.prompts[document_name].get('latest')['prompt']
+        prompt = "You are a user assistant. Answer the questions using only the context provided." 
+
+    system_prompt = prompt + " Context: {context} " + history_context + " "
+
+    chat_prompt = ChatPromptTemplate(
+            [
+                ("system", system_prompt),
+                ("human", "{input}")
+            ]
+        )
+    
+    qa_chain = create_stuff_documents_chain(llm, chat_prompt)
+
+    agent_document_retrieval = create_retrieval_chain(vectorstore.as_retriever(), qa_chain)
+    
+    return agent_document_retrieval

From 4b8e9920d84277b90e4d5cbfe94d698f859c0b8e Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Tue, 22 Apr 2025 22:34:26 -0300
Subject: [PATCH 11/24] refactor: streamlit pages for minuta, parte compradora
 and parte vendedora.

---
 pages/1_Anexar_Minuta.py    | 136 +++++++++++++++++++++---------------
 pages/2_Parte_Compradora.py |  96 ++++++++++---------------
 pages/3_Parte_Vendedora.py  |  66 +++++++++++++++--
 3 files changed, 176 insertions(+), 122 deletions(-)

diff --git a/pages/1_Anexar_Minuta.py b/pages/1_Anexar_Minuta.py
index 74eca9a69..b9a48a775 100644
--- a/pages/1_Anexar_Minuta.py
+++ b/pages/1_Anexar_Minuta.py
@@ -2,8 +2,11 @@
 import logging
 from streamlit.logger import get_logger
 from rag_utils.config import init
-from rag_utils.content_indexing import document_encoder_retriever
-from rag_utils.document_retrieval import build_agent
+# from rag_utils.content_indexing import document_encoder_retriever
+# from rag_utils.qa_document_retrieval import build_agent
+from rag_utils.pipeline import RAG_document_retrieval
+from utils import StreamHandler
+
 
 logging.basicConfig(level = logging.INFO)
 
@@ -34,61 +37,80 @@
 
     if 'rag_minuta' not in st.session_state:
         st.write("A IA irá coletar as informações presentes no documento...")
-        st.session_state['rag_minuta'] = True
-
-        minuta_retriever = document_encoder_retriever(
-            document_name='Minuta', 
-            uploaded_file=uploaded_file_minuta,
-            ocr_params={
-                'pages': [0],
-                'lang': 'por'
-            }, 
-            logger=logger, 
-            embeddings=st.session_state.embeddings,
-            vectorstore_config=st.session_state.vectorstore_config
-        )
-
-        st.session_state.minuta_db = minuta_retriever
-
-        minuta_system = st.session_state.prompts['Minuta Comprador'].get('latest').get('prompt_minuta', None)
-        
-        minuta_agent = build_agent(
-            prompt=minuta_system, 
-            vectorstore=minuta_retriever,
-            logger=logger,
-            history_context="",
-            llm=st.session_state.llm
-        )
-
-        query = st.session_state.prompts['Minuta Comprador'].get('latest')['input_minuta']
-        logger.info(f"{query}")
-        
-        minuta_response = minuta_agent.invoke({'input': query })
-        
-        answer = minuta_response['answer']
+        st.session_state.rag_minuta = uploaded_file_minuta
+
+        # Collect and structure data from Buyers 
+        answer = RAG_document_retrieval(
+                    document='Minuta Comprador',
+                    file=st.session_state.rag_minuta,
+                    prompts=st.session_state.prompts,
+                    logger=logger,
+                    embeddings=st.session_state.embeddings,
+                    vectordb_config=st.session_state.vectorstore_config,
+                    llm=st.session_state.llm,
+                    ocr_params={
+                        'pages': [0],
+                        'lang': 'por'
+                    }
+                )
         
         st.session_state.minuta_comprador = answer
 
-        minuta_system_owner = st.session_state.prompts['Minuta Vendedor'].get('latest').get('prompt_minuta', None)
-        
-        minuta_agent_owner = build_agent(
-            prompt=minuta_system_owner, 
-            vectorstore=minuta_retriever,
-            logger=logger,
-            history_context="",
-            llm=st.session_state.llm
-        )
-
-        query_owner = st.session_state.prompts['Minuta Vendedor'].get('latest')['input_minuta']
-        
-        minuta_response_owner = minuta_agent_owner.invoke({'input': query_owner })
-        
-        answer_owner = minuta_response_owner['answer']
-        
-        st.session_state.minuta_vendedor = answer_owner
-
-if 'minuta_comprador' in st.session_state:
-    st.write(st.session_state.minuta_comprador)
-
-if 'minuta_vendedor' in st.session_state:
-    st.write(st.session_state.minuta_vendedor)
\ No newline at end of file
+        # Print output answer
+        stream_handler = StreamHandler(st.empty())
+        for token in st.session_state.minuta_comprador:
+            stream_handler.on_llm_new_token(token=token)
+
+        # Collect and structure data from Sellers 
+        answer = RAG_document_retrieval(
+                    document='Minuta Vendedor',
+                    file=st.session_state.rag_minuta,
+                    prompts=st.session_state.prompts,
+                    logger=logger,
+                    embeddings=st.session_state.embeddings,
+                    vectordb_config=st.session_state.vectorstore_config,
+                    llm=st.session_state.llm,
+                    ocr_params={
+                        'pages': [0],
+                        'lang': 'por'
+                    }
+                )
+
+        st.session_state.minuta_vendedor = answer
+
+        # Print output answer
+        stream_handler = StreamHandler(st.empty())
+        for token in st.session_state.minuta_vendedor:
+            stream_handler.on_llm_new_token(token=token)
+
+        # Collect and structure data from Real State/Land
+        answer = RAG_document_retrieval(
+                    document='Minuta Imóvel',
+                    file=st.session_state.rag_minuta,
+                    prompts=st.session_state.prompts,
+                    logger=logger,
+                    embeddings=st.session_state.embeddings,
+                    vectordb_config=st.session_state.vectorstore_config,
+                    llm=st.session_state.llm,
+                    ocr_params={
+                        'pages': [0,1],
+                        'lang': 'por'
+                    }
+                )
+
+        st.session_state.minuta_imovel = answer
+
+        # Print output answer
+        stream_handler = StreamHandler(st.empty())
+        for token in st.session_state.minuta_imovel:
+            stream_handler.on_llm_new_token(token=token)
+
+    else:
+        if 'minuta_comprador' in st.session_state:
+            st.write(st.session_state.minuta_comprador)
+
+        if 'minuta_vendedor' in st.session_state:
+            st.write(st.session_state.minuta_vendedor)
+
+        if 'minuta_imovel' in st.session_state:
+            st.write(st.session_state.minuta_imovel)
\ No newline at end of file
diff --git a/pages/2_Parte_Compradora.py b/pages/2_Parte_Compradora.py
index 82315c54c..a8e8bedbf 100644
--- a/pages/2_Parte_Compradora.py
+++ b/pages/2_Parte_Compradora.py
@@ -1,27 +1,17 @@
 import streamlit as st
 from streamlit.logger import get_logger
 import logging
-from langchain.callbacks.base import BaseCallbackHandler
+from utils import StreamHandler
 from rag_utils.config import init
 from rag_utils.content_indexing import document_encoder_retriever
-from rag_utils.document_retrieval import build_agent
+from rag_utils.qa_document_retrieval import build_agent
+from rag_utils.pipeline import RAG_document_retrieval, RAG_document_validator
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 
 
 logging.basicConfig(level = logging.INFO)
 
-
-class StreamHandler(BaseCallbackHandler):
-    def __init__(self, container, initial_text=""):
-        self.container = container
-        self.text = initial_text
-
-    def on_llm_new_token(self, token: str, **kwargs) -> None:
-        self.text += token
-        self.container.markdown(self.text)
-
-
 if 'init' not in st.session_state:
     st.session_state.init = True
     init()
@@ -30,16 +20,27 @@ def on_llm_new_token(self, token: str, **kwargs) -> None:
     st.session_state.init_buyer_review_page = True
 
     st.session_state.buyer_documents_list = [
+        'CNH Comprador', 
+        'Comprovante de Residência Comprador', 
+        'Certidão de Casamento Comprador',
+        'Pacto Antenupcial ou Declaração de União Estável',
+        'CNH Cônjuge',
+        'Quitação ITBI'
+    ]
+    st.session_state.buyer_documents_list_tab = [
         'CNH', 
         'Comprovante de Residência', 
-        'Certidão de Casamento'
+        'Certidão de Casamento',
+        'Pacto Antenupcial ou Declaração de União Estável',
+        'CNH Cônjuge',
+        'Quitação ITBI'
     ]
     st.session_state.final_answer = dict().fromkeys(st.session_state.buyer_documents_list)
 
 logger = get_logger(__name__)
 
 # Define a list of Documents at app init() method
-tabs = st.tabs(st.session_state.buyer_documents_list)
+tabs = st.tabs(st.session_state.buyer_documents_list_tab)
 
 for tab, document in zip(tabs, st.session_state.buyer_documents_list):
     with tab:
@@ -54,58 +55,35 @@ def on_llm_new_token(self, token: str, **kwargs) -> None:
         if uploaded_file:
             st.write("A IA irá coletar e validar as informações presentes...")
 
-            # Text extraction and embedding using OCR and LLM to build a QA RAG
-            document_retriever = document_encoder_retriever(
-                document_name=document, 
-                uploaded_file=uploaded_file,
-                ocr_params={
-                    'pages': None, # All pages
-                    'lang': 'por'
-                }, 
-                logger=logger, 
-                embeddings=st.session_state.embeddings,
-                vectorstore_config=st.session_state.vectorstore_config
-            )
-
-            # prepare prompt with instructions
-            instructions = st.session_state.prompts[document].get('latest')['prompt']
-            agent = build_agent(
-                prompt=instructions, 
-                vectorstore=document_retriever, 
-                logger=logger, 
-                llm=st.session_state.llm
-            )
-
-            query = st.session_state.prompts[document].get('latest')['input']
-            answer = agent.invoke({'input': query})['answer']
+            # Collect and structure data from Buyers 
+            answer = RAG_document_retrieval(
+                    document=document,
+                    file=uploaded_file,
+                    prompts=st.session_state.prompts,
+                    logger=logger,
+                    embeddings=st.session_state.embeddings,
+                    vectordb_config=st.session_state.vectorstore_config,
+                    llm=st.session_state.llm,
+                    ocr_params={
+                        'pages': None,
+                        'lang': 'por'
+                    }
+                )
+        
             stream_handler = StreamHandler(st.empty())
             for token in answer:
                 stream_handler.on_llm_new_token(token=token)
 
             # Ask to LLM a table showing the Document data and Minuta data
             st.write(f"Validando de {document} com os dados da Minuta.")
-            
-            context = "Primeira tabela " + \
-                    answer + "| Segunda tabela " + \
-                    st.session_state.minuta_comprador
 
-            system_prompt = """ 
-            Você é um assistente que revisa documentos e precisa auxiliar o usuário que faz o trabalho manual 
-            de checar se dados que foram escritos na Minuta estão escritos da mesma forma que nos documentos de origem. 
-            O usuário fornecerá duas tabelas após o termo 'Contexto'.
-            Responda gerando uma tabela que compara apenas os dados dessas duas tabelas fornecidas.
-            Ignore diferenças de letras maiúsculas e minúsculas, ou que tenham símbolos '.', '-', ou '/'. 
-            """ + f" Contexto: {context} "
-            prompt = ChatPromptTemplate(
-                    [
-                        ("system", system_prompt),
-                        ("human", "{input}")
-                    ]
-                )
+            final_answer = RAG_document_validator(
+                document=document,
+                document_answer=answer,
+                minuta_answer=st.session_state.minuta_comprador,
+                llm=st.session_state.llm
+            )
             
-            chain = prompt | st.session_state.llm | StrOutputParser()
-
-            final_answer = chain.invoke("Compare apenas os dados do {document} os quais também estejam presentes na Minuta.")
             st.session_state.final_answer[document] = final_answer
 
             stream_handler = StreamHandler(st.empty())
diff --git a/pages/3_Parte_Vendedora.py b/pages/3_Parte_Vendedora.py
index b90aaadba..6aa72842e 100644
--- a/pages/3_Parte_Vendedora.py
+++ b/pages/3_Parte_Vendedora.py
@@ -1,12 +1,28 @@
 import streamlit as st
+from streamlit.logger import get_logger
+import logging
+from utils import StreamHandler
+from rag_utils.config import init
+from rag_utils.pipeline import RAG_document_retrieval, RAG_document_validator
+
+
+logging.basicConfig(level = logging.INFO)
+
+logger = get_logger(__name__)
+
+if 'init' not in st.session_state:
+    st.session_state.init = True
+    init()
 
 st.session_state.owner_documents_list = [
-    'CNH', 
-    'Comprovante de Residência', 
-    'Certidão de Casamento'
+    'CNH Vendedor', 
+    'Comprovante de Residência Vendedor',
+    'Matrícula do Imóvel'
 ]
 
-
+if 'init_owner_review_page' not in st.session_state:
+    st.session_state.init_owner_review_page = True
+    st.session_state.final_answer_owner = dict().fromkeys(st.session_state.owner_documents_list)
 
 # Define a list of Documents at app init() method
 tabs = st.tabs(st.session_state.owner_documents_list)
@@ -21,5 +37,43 @@
             key=document
         )
     
-    if uploaded_file:
-        st.write("A IA irá coletar e validar as informações presentes...")
\ No newline at end of file
+        if uploaded_file:
+            st.write("A IA irá coletar e validar as informações presentes...")
+
+            answer = RAG_document_retrieval(
+                        document=document,
+                        file=uploaded_file,
+                        prompts=st.session_state.prompts,
+                        logger=logger,
+                        embeddings=st.session_state.embeddings,
+                        vectordb_config=st.session_state.vectorstore_config,
+                        llm=st.session_state.llm
+                    )
+            # Print output answer
+            stream_handler = StreamHandler(st.empty())
+            for token in answer:
+                stream_handler.on_llm_new_token(token=token)
+
+            # Ask to LLM a table showing the Document data and Minuta data
+            st.write(f"Validando dados de {document} com os dados da Minuta.")
+
+            minuta_answer = st.session_state.minuta_vendedor 
+            if document == 'Matrícula do Imóvel':
+                minuta_answer = st.session_state.minuta_imovel
+            
+            final_answer = RAG_document_validator(
+                document=document,
+                document_answer=answer,
+                minuta_answer=minuta_answer,
+                llm=st.session_state.llm
+            )
+            st.session_state.final_answer_owner[document] = final_answer
+            
+            # Print output answer
+            stream_handler = StreamHandler(st.empty())
+            for token in final_answer:
+                stream_handler.on_llm_new_token(token=token)
+            
+        else:
+            if st.session_state.final_answer_owner[document]:
+                st.write(st.session_state.final_answer_owner[document])
\ No newline at end of file

From a91895ec50e9599971d651b1bacce0b981f50e14 Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Tue, 22 Apr 2025 22:35:18 -0300
Subject: [PATCH 12/24] refactor: rename file (remove document_retrieval.py).

---
 rag_utils/document_retrieval.py | 31 -------------------------------
 1 file changed, 31 deletions(-)
 delete mode 100644 rag_utils/document_retrieval.py

diff --git a/rag_utils/document_retrieval.py b/rag_utils/document_retrieval.py
deleted file mode 100644
index 9e89eb05f..000000000
--- a/rag_utils/document_retrieval.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from langchain_core.prompts import ChatPromptTemplate
-from langchain.chains import create_retrieval_chain
-from langchain_core.output_parsers import StrOutputParser
-from langchain.chains.combine_documents import create_stuff_documents_chain
-import logging
-
-
-def build_agent(prompt, vectorstore, logger: logging.Logger, history_context="", llm=None):
-    if not llm:
-        logger.error("LLM is not available!")
-
-        return None
-    
-    if not prompt:
-        # st.session_state.prompts[document_name].get('latest')['prompt']
-        prompt = "You are a user assistant. Answer the questions using only the context provided." 
-
-    system_prompt = prompt + " Context: {context} " + history_context + " "
-
-    chat_prompt = ChatPromptTemplate(
-            [
-                ("system", system_prompt),
-                ("human", "{input}")
-            ]
-        )
-    
-    qa_chain = create_stuff_documents_chain(llm, chat_prompt)
-
-    agent_document_retrieval = create_retrieval_chain(vectorstore.as_retriever(), qa_chain)
-    
-    return agent_document_retrieval

From 6794c3e0dd6c4a2ad146c829cdabba84fbd3ce89 Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Tue, 22 Apr 2025 22:36:12 -0300
Subject: [PATCH 13/24] refactor: prompt engineering and add output-token
 streaming to utils.py.

---
 prompts.json | 104 ++++++++++++++++++++++++++++++++++++---------------
 utils.py     |  13 +++++++
 2 files changed, 86 insertions(+), 31 deletions(-)

diff --git a/prompts.json b/prompts.json
index a751cf982..77ec70809 100644
--- a/prompts.json
+++ b/prompts.json
@@ -1,58 +1,100 @@
 {
     "Minuta Comprador": {
         "latest": {
-            "prompt_minuta": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta'. Always simplify the answer returning a structured table with information summary.",
-            "input_minuta": "Extraia todos os dados pessoais (inclua nome completo) e de endereço da parte compradora. Esses dados estão escritos no parágrafo que contém o termo 'Outorgada Compradora'. Crie uma tabela com 2 colunas: Dado e Valor."
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta Comprador'. Return all the text from the paragraph where the term 'Outorgada Compradora' exists and put tags <begin><end> to mark the text. The user will require data from this text, always return a structured table with that data.",
+            "input": "Extraia os dados da parte compradora e cônjuge referentes a: identificação e outros documentos pessoais apresentados, profissão, estado civil, regime de separação de bens (se casado ou união estável), cartório de notas e número de registro do Pacto Antenupcial ou União Estável (se declarado), e endereço de residência. Estruture esses dados em uma tabela com título 'Minuta Dados da Parte Compradora'."
         }
     },
-
     "Minuta Vendedor": {
         "latest": {
-            "prompt_minuta": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta'. Always simplify the answer returning a structured table with information summary.",
-            "input_minuta": "Extraia todos os dados pessoais da parte vendedora (inclua nome completo) escritos no parágrafo que contém o termo 'Outorgante Vendedor'. Crie uma tabela com 2 colunas: Dado e Valor."
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta Vendedor'. Return all the text from the paragraph where the term 'Outorgante Vendedora' exists and put tags <begin><end> to mark the text. The user will require data from this text, always return a structured table with that data.",
+            "input": "Extraia os dados da parte vendedora e cônjuge referentes a: identificação e outros documentos pessoais apresentados, profissão, estado civil, regime de separação de bens (se casado ou união estável), cartório de notas e número de registro do Pacto Antenupcial ou União Estável (se declarado), e endereço de residência. Estruture esses dados em uma tabela com título 'Minuta Dados da Parte Vendedora'."
+
         }
     },
-
-    "CNH": {
+    "Minuta Imóvel": {
+        "latest": {
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta Imóvel'. Return all the text from the paragraph where the term 'Cláusula 1a' exists and put tags <begin><end> to mark the text. The user will require data from this text, always return a structured table with that data.",
+            "input": "Extraia todas as informações acerca do imóvel: nome completo do proprietário, descrição do imóvel, logradouro, número, bairro e município, identificação de lote ou quadra, natureza do terreno (se pertence às forças armadas), características de cômodos, dimensões de tamanho e localização do imóvel, número de matrícula do imóvel e cartório que registrou a matrícula. Retorne uma tabela com título 'Dados do Imóvel' com essas informações."
+        }
+    },
+    "CNH Comprador": {
         "v1": "Extraia do documento CNH os dados nos campos: Nome completo, nacionalidade, data de nascimento, RG e órgão expedidor e CPF.",
-
         "v2": "Extraia do documento CNH os dados nos campos: Nome completo (1º Nome encontrado), nacionalidade, data de nascimento, RG com órgão expedidor e CPF (localizado após o RG).",
-
         "v3": "Extraia do documento CNH os dados nos campos: Nome completo (1º Nome encontrado), nacionalidade (faça inferência, se necessário), data de nascimento, RG com órgão expedidor e CPF (11 dígitos, localizado após o RG). Informe se a data de vencimento é maior que a data atual: 25/02/2025 (em caso negativo, retorne DOCUMENTO INVÁLIDO).",
-
         "v4": "Extraia do documento CNH os dados nos campos (alguns campos podem possuir escrita parecida com os dados a seguir, tente buscar campo com nome parecido): Nome completo (1º Nome encontrado), nacionalidade (faça inferência, se necessário), data de nascimento, RG com órgão expedidor e CPF (11 dígitos, localizado após o RG). Informe se a data de validade é maior que a data atual: 25/02/2025 (em caso negativo, retorne DOCUMENTO INVÁLIDO).",
-        
         "latest": {
-            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'CNH'. Always return a structured table consolidating the information in the end of the answer.",
-            "input": "Extraia os dados nos campos: Nome completo (1º Nome encontrado), nacionalidade (faça inferência, se necessário), data de nascimento, RG com órgão expedidor e CPF (11 dígitos separados por '.' e '-', localizado após o RG). Informe se a data de validade é maior que a data atual: 25/02/2025 (em caso negativo, retorne DOCUMENTO INVÁLIDO)."
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'CNH Comprador'. Always return a structured table consolidating the information in the end of the answer.",
+            "input": "Extraia os dados nos campos: Nome completo (1º Nome encontrado), nacionalidade (infira, se necessário e não declare ao usuário que foi inferido), data de nascimento, RG com órgão expedidor e CPF (11 dígitos separados por '.' e '-', localizado após o RG) remova '.', '-' e '/' dos valores. Retorne uma tabela com título 'Dados do CNH do Comprador' com essas informações."
         },
-    
         "resposta": "Nome Completo: MARLI SILVA DE ANDRADE; Nacionalidade: Brasileira (inferente do órgão emitente); Data de Nascimento: 19/08/1968; RG com Órgão Expedidor: 3198072 - SSP PE; CPF: Não localizado na informação fornecida; Validade do Documento: Até 29/04/2026. Como esta data está após 25/02/2025, o documento é válido."
     },
-
-    "Comprovante de Residência": {
+    "CNH Vendedor": {
+        "v1": "Extraia do documento CNH os dados nos campos: Nome completo, nacionalidade, data de nascimento, RG e órgão expedidor e CPF.",
+        "v2": "Extraia do documento CNH os dados nos campos: Nome completo (1º Nome encontrado), nacionalidade, data de nascimento, RG com órgão expedidor e CPF (localizado após o RG).",
+        "v3": "Extraia do documento CNH os dados nos campos: Nome completo (1º Nome encontrado), nacionalidade (faça inferência, se necessário), data de nascimento, RG com órgão expedidor e CPF (11 dígitos, localizado após o RG). Informe se a data de vencimento é maior que a data atual: 25/02/2025 (em caso negativo, retorne DOCUMENTO INVÁLIDO).",
+        "v4": "Extraia do documento CNH os dados nos campos (alguns campos podem possuir escrita parecida com os dados a seguir, tente buscar campo com nome parecido): Nome completo (1º Nome encontrado), nacionalidade (faça inferência, se necessário), data de nascimento, RG com órgão expedidor e CPF (11 dígitos, localizado após o RG). Informe se a data de validade é maior que a data atual: 25/02/2025 (em caso negativo, retorne DOCUMENTO INVÁLIDO).",
+        "latest": {
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'CNH Vendedor'. Always return a structured table consolidating the information in the end of the answer.",
+            "input": "Extraia os dados nos campos: Nome completo (1º Nome encontrado), nacionalidade (infira, se necessário e não declare ao usuário que foi inferido), data de nascimento, RG com órgão expedidor e CPF (11 dígitos separados por '.' e '-', localizado após o RG) remova '.', '-' e '/' dos valores. Retorne uma tabela com título 'Dados do CNH do Vendedor' com essas informações."
+        }
+    },
+    "Quitação ITBI": {
+        "latest": {
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Quitação ITBI'. Always return a structured table consolidating the information in the end of the answer.",
+            "input": "Extraia a Inscrição e/ou sequencial do imóvel na Prefeitura onde está o imóvel, nome completo da pessoa no documento e valor financeiro presente. Retorne uma tabela com título 'Dados do Comprovante de ITBI' com essas informações."
+        }
+    },
+    "Matrícula do Imóvel": {
+        "latest": {
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Matrícula do Imóvel'. Always return a structured table consolidating the information in the end of the answer.",
+            "input": "Extraia todas as informações acerca do imóvel: nome completo do proprietário, descrição do imóvel, logradouro, número, bairro e município, identificação de lote ou quadra, natureza do terreno (se pertence às forças armadas), características de cômodos, dimensões de tamanho e localização do imóvel, número de matrícula do imóvel, cartório que registrou a matrícula, e Inscrição e/ou sequencial do imóvel na Prefeitura onde está o imóvel. Retorne uma tabela com título 'Dados do Imóvel Matrícula' com essas informações."
+        }
+    },
+    "Comprovante de Residência Comprador": {
         "v1": "Extraia do documento 'Comprovante de Residência' os dados relacionados a endereço e CEP (Caixa Postal/ZIP Code).",
-
         "latest": {
-            "prompt":"You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Comprovante de Residência'. Always return a structured table consolidating the information in the end of the answer.",
-            "input": "Extraia os dados relacionados a endereço e CEP (Caixa Postal/ZIP Code). Gere uma tabela as informações a seguir (por linha): endereço, cidade, estado, país. Verifique se a data de envio do documento possui até 30 dias de diferença da data atual: 25/02/2025 (em caso negativo, retorne DOCUMENTO INVÁLIDO)." 
-        },
-
-        "resposta": "Nome: WILSON PEREIRA DE LIMA; Endereço: Rua Setubal, 1245 - Apartamento 1402; CEP (Caixa Postal): 51130-010; Localização: Recife - PE."
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Comprovante de Residência Comprador'. Always return a structured table consolidating the information in the end of the answer.",
+            "input": "Extraia os dados relacionados a endereço e CEP (Caixa Postal/ZIP Code). Retorne uma tabela com título 'Dados do Comprovante de Residência Comprador' com essas informações."
+        }
     },
-
-    "Certidão de Casamento": {
+    "Comprovante de Residência Vendedor": {
+        "v1": "Extraia do documento 'Comprovante de Residência' os dados relacionados a endereço e CEP (Caixa Postal/ZIP Code).",
+        "latest": {
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Comprovante de Residência Vendedor'. Always return a structured table consolidating the information in the end of the answer.",
+            "input": "Extraia os dados relacionados a endereço e CEP (Caixa Postal/ZIP Code). Retorne uma tabela com título 'Dados do Comprovante de Residência Vendedor' com essas informações."
+        }
+    },
+    "Certidão de Casamento Comprador": {
         "v1": "Extraia do documento 'Certidão de Casamento' os dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens.",
-
         "v2": "Extraia do documento 'Certidão de Casamento' os dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens e extrair os dados de registro da certidão e onde a certidão foi emitida.",
-
         "v3": "Extraia do documento 'Certidão de Casamento' os dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens. Extrair o número de registro da certidão, onde a certidão foi emitida e a data de emissão da Certidão.",
-
         "v4": "Extraia do documento 'Certidão de Casamento' os dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens. Extrair o número de registro da certidão, onde a certidão foi emitida e a data de emissão da Certidão.",
-
         "latest": {
-            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brazil). Always return a structured table gathering the information at the end. Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Certidão de Casamento'.",
-            "input": "Extraia dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens. Extrair o número de registro da certidão, onde a certidão foi emitida e a data de emissão da Certidão."
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brazil). Always return a structured table gathering the information at the end. Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Certidão de Casamento Comprador'.",
+            "input": "Extraia dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens. Extrair o número de registro da certidão, onde a certidão foi emitida e a data de emissão da Certidão. Retorne uma tabela com título 'Dados da Certidão de Casamento Comprador' com essas informações."
+        }
+    },
+    "Certidão de Casamento Vendedor": {
+        "v1": "Extraia do documento 'Certidão de Casamento' os dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens.",
+        "v2": "Extraia do documento 'Certidão de Casamento' os dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens e extrair os dados de registro da certidão e onde a certidão foi emitida.",
+        "v3": "Extraia do documento 'Certidão de Casamento' os dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens. Extrair o número de registro da certidão, onde a certidão foi emitida e a data de emissão da Certidão.",
+        "v4": "Extraia do documento 'Certidão de Casamento' os dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens. Extrair o número de registro da certidão, onde a certidão foi emitida e a data de emissão da Certidão.",
+        "latest": {
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brazil). Always return a structured table gathering the information at the end. Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Certidão de Casamento Vendedor'.",
+            "input": "Extraia dados relacionados ao Cônjuge: Nome, Documento de Identificação e Data do Casamento. Extrair o dado sobre o tipo de Regime de Bens. Extrair o número de registro da certidão, onde a certidão foi emitida e a data de emissão da Certidão. Retorne uma tabela com título 'Dados da Certidão de Casamento Vendedor' com essas informações."
+        }
+    },
+    "CNH Cônjuge": {
+        "latest": {
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'CNH Cônjuge'. Always return a structured table consolidating the information in the end of the answer.",
+            "input": "Extraia os dados nos campos: Nome completo (1º Nome encontrado), nacionalidade (infira, se necessário e não declare ao usuário que foi inferido), data de nascimento, RG com órgão expedidor e CPF (11 dígitos separados por '.' e '-', localizado após o RG) remova '.', '-' e '/' dos valores. Retorne uma tabela com título 'Dados do CNH do Cônjuge' com essas informações."
+        }
+    },
+    "Pacto Antenupcial ou Declaração de União Estável": {
+        "latest": {
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Pacto Antenupcial ou Declaração de União Estável'. Always return a structured table consolidating the information in the end of the answer.",
+            "input": "Extrair os dados sobre o tipo de Regime de Bens, número da Escritura, cartório onde foi lavrada, informações do livro e data. Retorne uma tabela com título 'Dados do Pacto/Declaração' com essas informações."
         }
     }
-}
+}
\ No newline at end of file
diff --git a/utils.py b/utils.py
index 23ad5b63d..15ecf6764 100644
--- a/utils.py
+++ b/utils.py
@@ -1,8 +1,21 @@
+from langchain.callbacks.base import BaseCallbackHandler
+
+
 class BaseLogger:
     def __init__(self) -> None:
         self.info = print
 
 
+class StreamHandler(BaseCallbackHandler):  # Callback handler that accumulates tokens and re-renders the running text in a container.
+    def __init__(self, container, initial_text=""):  # container: any object exposing a .markdown(text) method.
+        self.container = container
+        self.text = initial_text  # Running text; each received token is appended to it.
+
+    def on_llm_new_token(self, token: str, **kwargs) -> None:  # Append the token, then redraw; extra kwargs are ignored.
+        self.text += token
+        self.container.markdown(self.text)  # Renders the full accumulated text, not only the new token.
+
+
 def extract_title_and_question(input_string):
     lines = input_string.strip().split("\n")
 

From 14e471a81f89698f4061b0be7dbe1cae00b38125 Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Thu, 24 Apr 2025 11:17:16 -0300
Subject: [PATCH 14/24] Changing AI to OpenAI API

---
 StartLegal.py            | 155 +++++----------------------------------
 chains.py                |   7 +-
 pages/1_Anexar_Minuta.py |  24 +++---
 prompts.json             |   6 +-
 pull_model.clj           |   2 +-
 rag_utils/config.py      |   3 -
 6 files changed, 34 insertions(+), 163 deletions(-)

diff --git a/StartLegal.py b/StartLegal.py
index d5dc9a451..f1cb92284 100644
--- a/StartLegal.py
+++ b/StartLegal.py
@@ -1,154 +1,33 @@
-import os
-import json
-
 import streamlit as st
-from pdf2image import convert_from_bytes
-from PIL import Image
-import pytesseract
-from langchain.callbacks.base import BaseCallbackHandler
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.vectorstores import Neo4jVector
-from langchain_core.prompts import ChatPromptTemplate
-from langchain.chains import create_retrieval_chain
-from langchain_core.output_parsers import StrOutputParser
-from langchain.chains.combine_documents import create_stuff_documents_chain
 from streamlit.logger import get_logger
-from chains import (
-    load_embedding_model,
-    load_llm,
-)
-
-# load api key lib
-from dotenv import load_dotenv
 
 
 logger = get_logger(__name__)
 
+st.set_page_config(page_title="StartLegal", page_icon="📄")
 
-def init():
-    st.session_state.vectorstore_config = dict()
-    st.session_state.vectorstore_config['url'] = os.getenv("NEO4J_URI")
-    st.session_state.vectorstore_config['username'] = os.getenv("NEO4J_USERNAME")
-    st.session_state.vectorstore_config['password'] = os.getenv("NEO4J_PASSWORD")
-    
-    ollama_base_url = os.getenv("OLLAMA_BASE_URL")
-    embedding_model_name = os.getenv("EMBEDDING_MODEL")
-    llm_name = os.getenv("LLM")
-    # Remapping for Langchain Neo4j integration
-    os.environ["NEO4J_URL"] = st.session_state.vectorstore_config['url']
-
-    embeddings, dimension = load_embedding_model(
-        embedding_model_name, 
-        config={"ollama_base_url": ollama_base_url}, 
-        logger=logger
-    )
-    st.session_state.embeddings = embeddings
-    st.session_state.dimension = dimension
-
-    prompts = dict()
-    with open('prompts.json', 'rb') as f:
-        prompts = json.load(f)
-    
-    st.session_state.prompts = prompts
-    st.session_state.llm = load_llm(
-        llm_name, 
-        logger=logger, 
-        config={"ollama_base_url": ollama_base_url}
-    )
-
-    st.session_state.documents_list = ['CNH', 'Comprovante de Residência', 'Certidão de Casamento']
-    st.session_state.documents = []
-
-
-class StreamHandler(BaseCallbackHandler):
-    def __init__(self, container, initial_text=""):
-        self.container = container
-        self.text = initial_text
-
-    def on_llm_new_token(self, token: str, **kwargs) -> None:
-        self.text += token
-        self.container.markdown(self.text)
-
+st.title(body='📄 StartLegal')
+st.header("Módulo Revisor de Escrituras", divider='gray', )
 
-agents = dict()
-
-if 'init' not in st.session_state:
-    st.session_state.init = True
-    load_dotenv(".env")
-    init()
-
-st.set_page_config(page_title="StartLegal")
-
-st.header("StartLegal - Módulo Revisor 📄", divider='gray')
-
-st.subheader(
+st.write(
     "Anexe a minuta de uma escritura e em seguida os documentos necessários para revisão."
 )
 
-with st.sidebar:
-    st.title("Etapas de Revisão")
-#     st.text("Documentos apresentados:")
-#     for doc in st.session_state.documents:
-#         st.text(doc)
-
-#     st.text("Documentos apresentados...")
-
-# tabs = st.tabs(st.session_state.documents_list)
-
-# for tab, document in zip(tabs, st.session_state.documents_list):
-#     with tab:
-#         # upload a your files
-#         uploaded_file = st.file_uploader(
-#             "Suba o documento em algum desses formatos: PDF, png, jpeg, ou txt.", 
-#             accept_multiple_files=False,
-#             type=["png", "jpg", "jpeg", "pdf", "txt"],
-#             key=document
-#         )
-
-#         if uploaded_file:
-#             st.write("A IA irá coletar e validar as informações presentes...")
-
-#             # Text extraction and embedding using OCR and LLM to build a QA RAG
-#             query = st.session_state.prompts[document].get('latest')['input']
-#             agent = RAG_agent_document_validator(document, uploaded_file)
-#             answer = agent.invoke({'input': query})['answer']
-
-#             stream_handler = StreamHandler(st.empty())
-#             for token in answer:
-#                 stream_handler.on_llm_new_token(token=token)
-            
-#             # Visualize data from Minuta document
-#             st.write("Dados da Minuta (parte compradora)")
+doc_ = '''Siga os passos abaixo para revisar informações da Minuta:
+1. No menu à esquerda, clique em "Anexar Minuta" para inserir uma minuta no sistema e iniciar o processo de revisão.
+2. Em seguida clique em "Parte Compradora" e insira no sistema os documentos necessários em cada aba disponível (se necessário).
+    
+    2.1. Aguarde o sistema extrair as informações e realizar a comparação com a Minuta fornecida.
 
-#             stream_handler = StreamHandler(st.empty())
-#             for token in st.session_state.minuta_comprador:
-#                 stream_handler.on_llm_new_token(token=token)
+    2.2. Caso encontre alguma inconsistência, reportar ao escrivão e finalizar o processo de revisão.
 
-#             # Ask to LLM a table showing the Document data and Minuta data
-#             st.write(f"Validando de {document} com os dados da Minuta.")
-            
-#             context = "Primeira tabela " + \
-#                     answer + "| Segunda tabela " + \
-#                     st.session_state.minuta_comprador
+3. Por último, clique em "Parte Vendedora" e insira os documentos solicitados.
 
-#             system_prompt = """ 
-#             Você é um assistente que revisa documentos e precisa auxiliar o usuário que faz o trabalho manual 
-#             de checar se dados que foram escritos na Minuta estão escritos da mesma forma que nos documentos de origem. 
-#             O usuário fornecerá duas tabelas após o termo 'Contexto'.
-#             Responda gerando uma tabela que compara apenas os dados dessas duas tabelas fornecidas.
-#             Ignore diferenças de letras maiúsculas e minúsculas, ou que tenham símbolos '.', '-', ou '/'. 
-#             """ + f" Contexto: {context} "
-#             prompt = ChatPromptTemplate(
-#                     [
-#                         ("system", system_prompt),
-#                         ("human", "{input}")
-#                     ]
-#                 )
-            
-#             chain = prompt | st.session_state.llm | StrOutputParser()
+    3.1. Aguarde o sistema extrair as informações e realizar a comparação com a Minuta fornecida.
 
-#             final_answer = chain.invoke("Compare apenas os dados do {document} os quais também estejam presentes na Minuta.")
+    3.2. Caso encontre alguma inconsistência, reportar ao escrivão e finalizar o processo de revisão.
+'''
 
-#             stream_handler = StreamHandler(st.empty())
-#             for token in final_answer:
-#                 stream_handler.on_llm_new_token(token=token)
\ No newline at end of file
+st.markdown(
+    doc_
+)
\ No newline at end of file
diff --git a/chains.py b/chains.py
index 926ced7ee..316259bbf 100644
--- a/chains.py
+++ b/chains.py
@@ -63,12 +63,9 @@ def load_embedding_model(embedding_model_name: str, logger=BaseLogger(), config=
 
 
 def load_llm(llm_name: str, logger=BaseLogger(), config={}):
-    if llm_name in ["gpt-4", "gpt-4o", "gpt-4-turbo"]:
-        logger.info("LLM: Using GPT-4")
+    if llm_name.startswith("gpt"):
+        logger.info(f"LLM: Using OPENAI: {llm_name}")
         return ChatOpenAI(temperature=0, model_name=llm_name, streaming=True)
-    elif llm_name == "gpt-3.5":
-        logger.info("LLM: Using GPT-3.5")
-        return ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", streaming=True)
     elif llm_name == "claudev2":
         logger.info("LLM: ClaudeV2")
         return ChatBedrock(
diff --git a/pages/1_Anexar_Minuta.py b/pages/1_Anexar_Minuta.py
index b9a48a775..484cf8772 100644
--- a/pages/1_Anexar_Minuta.py
+++ b/pages/1_Anexar_Minuta.py
@@ -2,8 +2,6 @@
 import logging
 from streamlit.logger import get_logger
 from rag_utils.config import init
-# from rag_utils.content_indexing import document_encoder_retriever
-# from rag_utils.qa_document_retrieval import build_agent
 from rag_utils.pipeline import RAG_document_retrieval
 from utils import StreamHandler
 
@@ -79,9 +77,9 @@
         st.session_state.minuta_vendedor = answer
 
         # Print output answer
-        stream_handler = StreamHandler(st.empty())
+        stream_handler2 = StreamHandler(st.empty())
         for token in st.session_state.minuta_vendedor:
-            stream_handler.on_llm_new_token(token=token)
+            stream_handler2.on_llm_new_token(token=token)
 
         # Collect and structure data from Real State/Land
         answer = RAG_document_retrieval(
@@ -101,16 +99,16 @@
         st.session_state.minuta_imovel = answer
 
         # Print output answer
-        stream_handler = StreamHandler(st.empty())
+        stream_handler3 = StreamHandler(st.empty())
         for token in st.session_state.minuta_imovel:
-            stream_handler.on_llm_new_token(token=token)
+            stream_handler3.on_llm_new_token(token=token)
 
-    else:
-        if 'minuta_comprador' in st.session_state:
-            st.write(st.session_state.minuta_comprador)
+else:
+    if 'minuta_comprador' in st.session_state:
+        st.write(st.session_state.minuta_comprador)
 
-        if 'minuta_vendedor' in st.session_state:
-            st.write(st.session_state.minuta_vendedor)
+    if 'minuta_vendedor' in st.session_state:
+        st.write(st.session_state.minuta_vendedor)
 
-        if 'minuta_imovel' in st.session_state:
-            st.write(st.session_state.minuta_imovel)
\ No newline at end of file
+    if 'minuta_imovel' in st.session_state:
+        st.write(st.session_state.minuta_imovel)
\ No newline at end of file
diff --git a/prompts.json b/prompts.json
index 77ec70809..072ac1c02 100644
--- a/prompts.json
+++ b/prompts.json
@@ -1,20 +1,20 @@
 {
     "Minuta Comprador": {
         "latest": {
-            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta Comprador'. Return all the text from the paragraph where the term 'Outorgada Compradora' exists and put tags <begin><end> to mark the text. The user will require data from this text, always return a structured table with that data.",
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta Comprador'. Return all the text from the paragraph where the term 'Outorgada Compradora' exists and show the text. The user will require data from this text, always return a structured table with that data.",
             "input": "Extraia os dados da parte compradora e cônjuge referentes a: identificação e outros documentos pessoais apresentados, profissão, estado civil, regime de separação de bens (se casado ou união estável), cartório de notas e número de registro do Pacto Antenupcial ou União Estável (se declarado), e endereço de residência. Estruture esses dados em uma tabela com título 'Minuta Dados da Parte Compradora'."
         }
     },
     "Minuta Vendedor": {
         "latest": {
-            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta Vendedor'. Return all the text from the paragraph where the term 'Outorgante Vendedora' exists and put tags <begin><end> to mark the text. The user will require data from this text, always return a structured table with that data.",
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta Vendedor'. Return all the text from the paragraph where the term 'Outorgante Vendedora' exists and show the text. The user will require data from this text, always return a structured table with that data.",
             "input": "Extraia os dados da parte vendedora e cônjuge referentes a: identificação e outros documentos pessoais apresentados, profissão, estado civil, regime de separação de bens (se casado ou união estável), cartório de notas e número de registro do Pacto Antenupcial ou União Estável (se declarado), e endereço de residência. Estruture esses dados em uma tabela com título 'Minuta Dados da Parte Vendedora'."
 
         }
     },
     "Minuta Imóvel": {
         "latest": {
-            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta Imóvel'. Return all the text from the paragraph where the term 'Cláusula 1a' exists and put tags <begin><end> to mark the text. The user will require data from this text, always return a structured table with that data.",
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta Imóvel'. Return all the text from the paragraph where the term 'Cláusula 1a' exists and show the text. The user will require data from this text, always return a structured table with that data.",
             "input": "Extraia todas as informações acerca do imóvel: nome completo do proprietário, descrição do imóvel, logradouro, número, bairro e município, identificação de lote ou quadra, natureza do terreno (se pertence às forças armadas), características de cômodos, dimensões de tamanho e localização do imóvel, número de matrícula do imóvel e cartório que registrou a matrícula. Retorne uma tabela com título 'Dados do Imóvel' com essas informações."
         }
     },
diff --git a/pull_model.clj b/pull_model.clj
index 99bffc6dd..bc3a87483 100644
--- a/pull_model.clj
+++ b/pull_model.clj
@@ -8,7 +8,7 @@
     (println (format "pulling ollama model %s using %s" llm url))
     (if (and llm
          url
-         (not (#{"gpt-4" "gpt-3.5" "claudev2" "gpt-4o" "gpt-4-turbo"} llm))
+         (not (#{"gpt-4" "gpt-3.5" "claudev2" "gpt-4o" "gpt-4-turbo" "gpt-4.1-nano"} llm))
          (not (some #(.startsWith llm %) ["ai21.jamba-instruct-v1:0"
                                           "amazon.titan"
                                           "anthropic.claude"
diff --git a/rag_utils/config.py b/rag_utils/config.py
index 6942f971f..327797aa3 100644
--- a/rag_utils/config.py
+++ b/rag_utils/config.py
@@ -47,6 +47,3 @@ def init():
         logger=logger, 
         config={"ollama_base_url": ollama_base_url}
     )
-
-    st.session_state.documents_list = ['CNH', 'Comprovante de Residência', 'Certidão de Casamento']
-    st.session_state.documents = []

From 43fac6b7d15f408c6415b04d15a1ac1d592e2dae Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Thu, 24 Apr 2025 16:46:10 -0300
Subject: [PATCH 15/24] feat: output text used as Context

---
 prompts.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/prompts.json b/prompts.json
index 072ac1c02..55b4d0ca9 100644
--- a/prompts.json
+++ b/prompts.json
@@ -1,20 +1,20 @@
 {
     "Minuta Comprador": {
         "latest": {
-            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta Comprador'. Return all the text from the paragraph where the term 'Outorgada Compradora' exists and show the text. The user will require data from this text, always return a structured table with that data.",
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta Comprador'. Return all the text from the paragraph where the term 'Outorgada Compradora' exists and put it at the begining of the answer as a quote string . The user will require data from this text, always return a structured table with that data.",
             "input": "Extraia os dados da parte compradora e cônjuge referentes a: identificação e outros documentos pessoais apresentados, profissão, estado civil, regime de separação de bens (se casado ou união estável), cartório de notas e número de registro do Pacto Antenupcial ou União Estável (se declarado), e endereço de residência. Estruture esses dados em uma tabela com título 'Minuta Dados da Parte Compradora'."
         }
     },
     "Minuta Vendedor": {
         "latest": {
-            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta Vendedor'. Return all the text from the paragraph where the term 'Outorgante Vendedora' exists and show the text. The user will require data from this text, always return a structured table with that data.",
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta Vendedor'. Return all the text from the paragraph where the term 'Outorgante Vendedora' exists and put it at the begining of the answer as a quote string . The user will require data from this text, always return a structured table with that data.",
             "input": "Extraia os dados da parte vendedora e cônjuge referentes a: identificação e outros documentos pessoais apresentados, profissão, estado civil, regime de separação de bens (se casado ou união estável), cartório de notas e número de registro do Pacto Antenupcial ou União Estável (se declarado), e endereço de residência. Estruture esses dados em uma tabela com título 'Minuta Dados da Parte Vendedora'."
 
         }
     },
     "Minuta Imóvel": {
         "latest": {
-            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta Imóvel'. Return all the text from the paragraph where the term 'Cláusula 1a' exists and show the text. The user will require data from this text, always return a structured table with that data.",
+            "prompt": "You are an assistant for question-answering tasks. Always answer in Portuguese (Brasil). Retrieve information from Context where the token 'NOME_DO_DOCUMENTO' contains 'Minuta Imóvel'. Return all the text from the paragraph where the term 'Cláusula 1a' exists and put it at the begining of the answer as a quote string . The user will require data from this text, always return a structured table with that data.",
             "input": "Extraia todas as informações acerca do imóvel: nome completo do proprietário, descrição do imóvel, logradouro, número, bairro e município, identificação de lote ou quadra, natureza do terreno (se pertence às forças armadas), características de cômodos, dimensões de tamanho e localização do imóvel, número de matrícula do imóvel e cartório que registrou a matrícula. Retorne uma tabela com título 'Dados do Imóvel' com essas informações."
         }
     },

From a0c1e498907fde56d2ebe21b0fa37dbf76aa029b Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Thu, 24 Apr 2025 16:47:54 -0300
Subject: [PATCH 16/24] feat: add monitoring logs about tokens

---
 chains.py                   |  2 +-
 pages/2_Parte_Compradora.py |  3 ++-
 pages/3_Parte_Vendedora.py  |  3 ++-
 rag_utils/pipeline.py       | 24 ++++++++++++++++++++----
 4 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/chains.py b/chains.py
index 316259bbf..f84eca120 100644
--- a/chains.py
+++ b/chains.py
@@ -65,7 +65,7 @@ def load_embedding_model(embedding_model_name: str, logger=BaseLogger(), config=
 def load_llm(llm_name: str, logger=BaseLogger(), config={}):
     if llm_name.startswith("gpt"):
         logger.info(f"LLM: Using OPENAI: {llm_name}")
-        return ChatOpenAI(temperature=0, model_name=llm_name, streaming=True)
+        return ChatOpenAI(temperature=0, model_name=llm_name)
     elif llm_name == "claudev2":
         logger.info("LLM: ClaudeV2")
         return ChatBedrock(
diff --git a/pages/2_Parte_Compradora.py b/pages/2_Parte_Compradora.py
index a8e8bedbf..af7baaa6d 100644
--- a/pages/2_Parte_Compradora.py
+++ b/pages/2_Parte_Compradora.py
@@ -81,7 +81,8 @@
                 document=document,
                 document_answer=answer,
                 minuta_answer=st.session_state.minuta_comprador,
-                llm=st.session_state.llm
+                llm=st.session_state.llm,
+                logger=logger
             )
             
             st.session_state.final_answer[document] = final_answer
diff --git a/pages/3_Parte_Vendedora.py b/pages/3_Parte_Vendedora.py
index 6aa72842e..cbfedb957 100644
--- a/pages/3_Parte_Vendedora.py
+++ b/pages/3_Parte_Vendedora.py
@@ -65,7 +65,8 @@
                 document=document,
                 document_answer=answer,
                 minuta_answer=minuta_answer,
-                llm=st.session_state.llm
+                llm=st.session_state.llm,
+                logger=logger
             )
             st.session_state.final_answer_owner[document] = final_answer
             
diff --git a/rag_utils/pipeline.py b/rag_utils/pipeline.py
index 1d6ebd42e..26b27aacf 100644
--- a/rag_utils/pipeline.py
+++ b/rag_utils/pipeline.py
@@ -2,13 +2,15 @@
 from rag_utils.qa_document_retrieval import build_agent
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
+from langchain_community.callbacks import get_openai_callback
+import logging
 
 
 def RAG_document_retrieval(
     document, 
     file,
     prompts, 
-    logger, 
+    logger: logging.Logger, 
     embeddings, 
     vectordb_config,
     llm,
@@ -35,12 +37,19 @@ def RAG_document_retrieval(
 
     # QA RAG document retrieval
     query = prompts[document].get('latest')['input']
-    answer = agent.invoke({'input': query})['answer']
+
+    with get_openai_callback() as cb:
+        answer = agent.invoke({'input': query})['answer']
+
+    logger.info(f"Total Tokens: {cb.total_tokens}")
+    logger.info(f"Prompt Tokens: {cb.prompt_tokens}")
+    logger.info(f"Completion Tokens: {cb.completion_tokens}")
+    logger.info(f"Total Cost (USD): ${cb.total_cost}")
     
     return answer
 
 
-def RAG_document_validator(document, document_answer, minuta_answer, llm):
+def RAG_document_validator(document, document_answer, minuta_answer, llm, logger: logging.Logger):
     
     # Build context aggregating information from document and Minuta
     context = f"Tabela {document} " + \
@@ -67,5 +76,12 @@ def RAG_document_validator(document, document_answer, minuta_answer, llm):
     chain = prompt | llm | StrOutputParser()
     
     # QA RAG document validation
-    answer = chain.invoke(f"Compare apenas os dados do {document} os quais também estejam presentes na Minuta.")
+    with get_openai_callback() as cb:
+        answer = chain.invoke(f"Compare apenas os dados do {document} os quais também estejam presentes na Minuta.")
+        
+    logger.info(f"Total Tokens: {cb.total_tokens}")
+    logger.info(f"Prompt Tokens: {cb.prompt_tokens}")
+    logger.info(f"Completion Tokens: {cb.completion_tokens}")
+    logger.info(f"Total Cost (USD): ${cb.total_cost}")
+
     return answer
\ No newline at end of file

From e9b04ef587242c63c54be34cad41e67ea7807806 Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Thu, 24 Apr 2025 16:48:33 -0300
Subject: [PATCH 17/24] refactor: allow more versions from OpenAI models

---
 pages/1_Anexar_Minuta.py | 2 --
 pull_model.clj           | 3 ++-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/pages/1_Anexar_Minuta.py b/pages/1_Anexar_Minuta.py
index 484cf8772..57b268a16 100644
--- a/pages/1_Anexar_Minuta.py
+++ b/pages/1_Anexar_Minuta.py
@@ -16,8 +16,6 @@
     
 st.set_page_config(page_title="StartLegal - Anexar a Minuta")
 
-st.sidebar.header("Dados obtidos da Minuta")
-
 st.subheader(
     "Anexe a minuta da escritura para iniciar a revisão.",
     divider='gray'
diff --git a/pull_model.clj b/pull_model.clj
index bc3a87483..ed9d3b0be 100644
--- a/pull_model.clj
+++ b/pull_model.clj
@@ -8,7 +8,8 @@
     (println (format "pulling ollama model %s using %s" llm url))
     (if (and llm
          url
-         (not (#{"gpt-4" "gpt-3.5" "claudev2" "gpt-4o" "gpt-4-turbo" "gpt-4.1-nano"} llm))
+         (not (some #(.startsWith llm %) ["gpt" 
+                                          "claudev2"]))
          (not (some #(.startsWith llm %) ["ai21.jamba-instruct-v1:0"
                                           "amazon.titan"
                                           "anthropic.claude"

From ab4cd41f05e242b8a20bb2930125a39182b7de44 Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Fri, 25 Apr 2025 19:23:58 -0300
Subject: [PATCH 18/24] feat: add document viewer to all pages

---
 pages/1_Anexar_Minuta.py    | 57 +++++++++++++++++-------
 pages/2_Parte_Compradora.py | 89 ++++++++++++++++++++++---------------
 pages/3_Parte_Vendedora.py  | 86 ++++++++++++++++++++++-------------
 requirements.txt            |  3 +-
 4 files changed, 150 insertions(+), 85 deletions(-)

diff --git a/pages/1_Anexar_Minuta.py b/pages/1_Anexar_Minuta.py
index 57b268a16..e2b76225d 100644
--- a/pages/1_Anexar_Minuta.py
+++ b/pages/1_Anexar_Minuta.py
@@ -4,6 +4,7 @@
 from rag_utils.config import init
 from rag_utils.pipeline import RAG_document_retrieval
 from utils import StreamHandler
+import base64
 
 
 logging.basicConfig(level = logging.INFO)
@@ -14,7 +15,9 @@
     st.session_state.init = True
     init()
     
-st.set_page_config(page_title="StartLegal - Anexar a Minuta")
+st.set_page_config(page_title="StartLegal - Anexar a Minuta", layout="wide")
+
+
 
 st.subheader(
     "Anexe a minuta da escritura para iniciar a revisão.",
@@ -25,20 +28,29 @@
 uploaded_file_minuta = st.file_uploader(
     "Suba o documento da Minuta em formato PDF.", 
     accept_multiple_files=False,
-    type="pdf",
-    key='minuta'
+    type="pdf"
 )
 
 if uploaded_file_minuta:
+    st.write("A IA irá coletar as informações presentes no documento...")
+    
+    col1, col2 = st.columns(2, vertical_alignment="center")
+
+    with col2:
+        base64_pdf = base64.b64encode(uploaded_file_minuta.getvalue()).decode("utf-8")
+        pdf_display = (
+            f'<embed src="data:application/pdf;base64,{base64_pdf}" '
+            'width="960" height="2160" type="application/pdf"></embed>'
+        )
+        
+        st.markdown(pdf_display, unsafe_allow_html=True)
+        st.session_state.minuta_file = uploaded_file_minuta
 
-    if 'rag_minuta' not in st.session_state:
-        st.write("A IA irá coletar as informações presentes no documento...")
-        st.session_state.rag_minuta = uploaded_file_minuta
-
+    with col1:
         # Collect and structure data from Buyers 
         answer = RAG_document_retrieval(
                     document='Minuta Comprador',
-                    file=st.session_state.rag_minuta,
+                    file=uploaded_file_minuta,
                     prompts=st.session_state.prompts,
                     logger=logger,
                     embeddings=st.session_state.embeddings,
@@ -60,7 +72,7 @@
         # Collect and structure data from Sellers 
         answer = RAG_document_retrieval(
                     document='Minuta Vendedor',
-                    file=st.session_state.rag_minuta,
+                    file=uploaded_file_minuta,
                     prompts=st.session_state.prompts,
                     logger=logger,
                     embeddings=st.session_state.embeddings,
@@ -82,7 +94,7 @@
         # Collect and structure data from Real State/Land
         answer = RAG_document_retrieval(
                     document='Minuta Imóvel',
-                    file=st.session_state.rag_minuta,
+                    file=uploaded_file_minuta,
                     prompts=st.session_state.prompts,
                     logger=logger,
                     embeddings=st.session_state.embeddings,
@@ -102,11 +114,24 @@
             stream_handler3.on_llm_new_token(token=token)
 
 else:
-    if 'minuta_comprador' in st.session_state:
-        st.write(st.session_state.minuta_comprador)
+    if 'minuta_file' in st.session_state:
+        col3, col4 = st.columns(2, vertical_alignment="center")
+
+        with col4:
+            base64_pdf = base64.b64encode(st.session_state.minuta_file.getvalue()).decode("utf-8")
+            pdf_display = (
+                f'<embed src="data:application/pdf;base64,{base64_pdf}" '
+                'width="960" height="2160" type="application/pdf"></embed>'
+            )
+            
+            st.markdown(pdf_display, unsafe_allow_html=True)
+        
+        with col3:
+            if 'minuta_comprador' in st.session_state:
+                st.write(st.session_state.minuta_comprador)
 
-    if 'minuta_vendedor' in st.session_state:
-        st.write(st.session_state.minuta_vendedor)
+            if 'minuta_vendedor' in st.session_state:
+                st.write(st.session_state.minuta_vendedor)
 
-    if 'minuta_imovel' in st.session_state:
-        st.write(st.session_state.minuta_imovel)
\ No newline at end of file
+            if 'minuta_imovel' in st.session_state:
+                st.write(st.session_state.minuta_imovel)
\ No newline at end of file
diff --git a/pages/2_Parte_Compradora.py b/pages/2_Parte_Compradora.py
index af7baaa6d..b86919166 100644
--- a/pages/2_Parte_Compradora.py
+++ b/pages/2_Parte_Compradora.py
@@ -3,11 +3,8 @@
 import logging
 from utils import StreamHandler
 from rag_utils.config import init
-from rag_utils.content_indexing import document_encoder_retriever
-from rag_utils.qa_document_retrieval import build_agent
 from rag_utils.pipeline import RAG_document_retrieval, RAG_document_validator
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.output_parsers import StrOutputParser
+import base64
 
 
 logging.basicConfig(level = logging.INFO)
@@ -55,41 +52,63 @@
         if uploaded_file:
             st.write("A IA irá coletar e validar as informações presentes...")
 
-            # Collect and structure data from Buyers 
-            answer = RAG_document_retrieval(
-                    document=document,
-                    file=uploaded_file,
-                    prompts=st.session_state.prompts,
-                    logger=logger,
-                    embeddings=st.session_state.embeddings,
-                    vectordb_config=st.session_state.vectorstore_config,
-                    llm=st.session_state.llm,
-                    ocr_params={
-                        'pages': None,
-                        'lang': 'por'
-                    }
-                )
-        
-            stream_handler = StreamHandler(st.empty())
-            for token in answer:
-                stream_handler.on_llm_new_token(token=token)
+            col1, col2, col3 = st.columns(3, vertical_alignment="center")
 
-            # Ask to LLM a table showing the Document data and Minuta data
-            st.write(f"Validando de {document} com os dados da Minuta.")
+            with col1:
+                base64_pdf = base64.b64encode(uploaded_file.getvalue()).decode("utf-8")
+                pdf_display = (
+                    f'<embed src="data:application/pdf;base64,{base64_pdf}" '
+                    'width="640" height="1080" type="application/pdf"></embed>'
+                )
+                
+                st.markdown(pdf_display, unsafe_allow_html=True)
+            
+            with col3:
+                base64_pdf = base64.b64encode(st.session_state.minuta_file.getvalue()).decode("utf-8")
+                pdf_display = (
+                    f'<embed src="data:application/pdf;base64,{base64_pdf}" '
+                    'width="640" height="1080" type="application/pdf"></embed>'
+                )
+                
+                st.markdown(pdf_display, unsafe_allow_html=True)
 
-            final_answer = RAG_document_validator(
-                document=document,
-                document_answer=answer,
-                minuta_answer=st.session_state.minuta_comprador,
-                llm=st.session_state.llm,
-                logger=logger
-            )
+            with col2:
+                # Collect and structure data from Buyers 
+                answer = RAG_document_retrieval(
+                        document=document,
+                        file=uploaded_file,
+                        prompts=st.session_state.prompts,
+                        logger=logger,
+                        embeddings=st.session_state.embeddings,
+                        vectordb_config=st.session_state.vectorstore_config,
+                        llm=st.session_state.llm,
+                        ocr_params={
+                            'pages': None,
+                            'lang': 'por'
+                        }
+                    )
             
-            st.session_state.final_answer[document] = final_answer
+                stream_handler = StreamHandler(st.empty())
+                for token in answer:
+                    stream_handler.on_llm_new_token(token=token)
 
-            stream_handler = StreamHandler(st.empty())
-            for token in final_answer:
-                stream_handler.on_llm_new_token(token=token)
+                # Ask to LLM a table showing the Document data and Minuta data
+                st.write(f"Validando de {document} com os dados da Minuta.")
+
+                final_answer = RAG_document_validator(
+                    document=document,
+                    document_answer=answer,
+                    minuta_answer=st.session_state.minuta_comprador,
+                    llm=st.session_state.llm,
+                    logger=logger
+                )
+                
+                st.session_state.final_answer[document] = final_answer
+
+                stream_handler = StreamHandler(st.empty())
+                for token in final_answer:
+                    stream_handler.on_llm_new_token(token=token)
+        
         else:
             if st.session_state.final_answer[document]:
                 st.write(st.session_state.final_answer[document])
\ No newline at end of file
diff --git a/pages/3_Parte_Vendedora.py b/pages/3_Parte_Vendedora.py
index cbfedb957..acf236938 100644
--- a/pages/3_Parte_Vendedora.py
+++ b/pages/3_Parte_Vendedora.py
@@ -4,6 +4,7 @@
 from utils import StreamHandler
 from rag_utils.config import init
 from rag_utils.pipeline import RAG_document_retrieval, RAG_document_validator
+import base64
 
 
 logging.basicConfig(level = logging.INFO)
@@ -40,40 +41,61 @@
         if uploaded_file:
             st.write("A IA irá coletar e validar as informações presentes...")
 
-            answer = RAG_document_retrieval(
-                        document=document,
-                        file=uploaded_file,
-                        prompts=st.session_state.prompts,
-                        logger=logger,
-                        embeddings=st.session_state.embeddings,
-                        vectordb_config=st.session_state.vectorstore_config,
-                        llm=st.session_state.llm
-                    )
-            # Print output answer
-            stream_handler = StreamHandler(st.empty())
-            for token in answer:
-                stream_handler.on_llm_new_token(token=token)
+            col1, col2, col3 = st.columns(3, vertical_alignment="center")
 
-            # Ask to LLM a table showing the Document data and Minuta data
-            st.write(f"Validando dados de {document} com os dados da Minuta.")
-
-            minuta_answer = st.session_state.minuta_vendedor 
-            if document == 'Matrícula do Imóvel':
-                minuta_answer = st.session_state.minuta_imovel
-            
-            final_answer = RAG_document_validator(
-                document=document,
-                document_answer=answer,
-                minuta_answer=minuta_answer,
-                llm=st.session_state.llm,
-                logger=logger
-            )
-            st.session_state.final_answer_owner[document] = final_answer
+            with col1:
+                base64_pdf = base64.b64encode(uploaded_file.getvalue()).decode("utf-8")
+                pdf_display = (
+                    f'<embed src="data:application/pdf;base64,{base64_pdf}" '
+                    'width="640" height="1080" type="application/pdf"></embed>'
+                )
+                
+                st.markdown(pdf_display, unsafe_allow_html=True)
             
-            # Print output answer
-            stream_handler = StreamHandler(st.empty())
-            for token in final_answer:
-                stream_handler.on_llm_new_token(token=token)
+            with col3:
+                base64_pdf = base64.b64encode(st.session_state.minuta_file.getvalue()).decode("utf-8")
+                pdf_display = (
+                    f'<embed src="data:application/pdf;base64,{base64_pdf}" '
+                    'width="640" height="1080" type="application/pdf"></embed>'
+                )
+                
+                st.markdown(pdf_display, unsafe_allow_html=True)
+
+            with col2:
+                answer = RAG_document_retrieval(
+                            document=document,
+                            file=uploaded_file,
+                            prompts=st.session_state.prompts,
+                            logger=logger,
+                            embeddings=st.session_state.embeddings,
+                            vectordb_config=st.session_state.vectorstore_config,
+                            llm=st.session_state.llm
+                        )
+                # Print output answer
+                stream_handler = StreamHandler(st.empty())
+                for token in answer:
+                    stream_handler.on_llm_new_token(token=token)
+
+                # Ask to LLM a table showing the Document data and Minuta data
+                st.write(f"Validando dados de {document} com os dados da Minuta.")
+
+                minuta_answer = st.session_state.minuta_vendedor 
+                if document == 'Matrícula do Imóvel':
+                    minuta_answer = st.session_state.minuta_imovel
+                
+                final_answer = RAG_document_validator(
+                    document=document,
+                    document_answer=answer,
+                    minuta_answer=minuta_answer,
+                    llm=st.session_state.llm,
+                    logger=logger
+                )
+                st.session_state.final_answer_owner[document] = final_answer
+                
+                # Print output answer
+                stream_handler = StreamHandler(st.empty())
+                for token in final_answer:
+                    stream_handler.on_llm_new_token(token=token)
             
         else:
             if st.session_state.final_answer_owner[document]:
diff --git a/requirements.txt b/requirements.txt
index db6a1e032..02aca6e66 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ python-dotenv
 wikipedia
 tiktoken
 neo4j
-streamlit
+streamlit==1.44.0
 Pillow
 fastapi
 pdf2image==1.17.0
@@ -12,7 +12,6 @@ pydantic
 uvicorn
 sse-starlette
 boto3
-streamlit==1.32.1
 # missing from the langchain base image?
 langchain-openai==0.2.4
 langchain-community==0.3.3

From b577f41f9daf08755638955b56658439268f062b Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Tue, 20 May 2025 10:06:57 -0300
Subject: [PATCH 19/24] feat: change column vertical alignment to top

---
 pages/2_Parte_Compradora.py | 2 +-
 pages/3_Parte_Vendedora.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pages/2_Parte_Compradora.py b/pages/2_Parte_Compradora.py
index b86919166..7487378d3 100644
--- a/pages/2_Parte_Compradora.py
+++ b/pages/2_Parte_Compradora.py
@@ -52,7 +52,7 @@
         if uploaded_file:
             st.write("A IA irá coletar e validar as informações presentes...")
 
-            col1, col2, col3 = st.columns(3, vertical_alignment="center")
+            col1, col2, col3 = st.columns(3, vertical_alignment="top")
 
             with col1:
                 base64_pdf = base64.b64encode(uploaded_file.getvalue()).decode("utf-8")
diff --git a/pages/3_Parte_Vendedora.py b/pages/3_Parte_Vendedora.py
index acf236938..eb4ebab50 100644
--- a/pages/3_Parte_Vendedora.py
+++ b/pages/3_Parte_Vendedora.py
@@ -41,7 +41,7 @@
         if uploaded_file:
             st.write("A IA irá coletar e validar as informações presentes...")
 
-            col1, col2, col3 = st.columns(3, vertical_alignment="center")
+            col1, col2, col3 = st.columns(3, vertical_alignment="top")
 
             with col1:
                 base64_pdf = base64.b64encode(uploaded_file.getvalue()).decode("utf-8")

From 85e7def3a9de340085b7e88243589f768517ad07 Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Tue, 17 Jun 2025 15:37:03 -0300
Subject: [PATCH 20/24] Adding page for new feature. Organizing home page to
 route multiple pages for 2 app features.

---
 Escrita_de_Minuta.py          |  8 ++++++++
 Revisor_de_Minuta.py          | 27 +++++++++++++++++++++++++
 StartLegal.py                 | 37 ++++++++++++-----------------------
 multiple_files_bot.Dockerfile |  2 ++
 pages/1_Anexar_Minuta.py      |  4 ----
 5 files changed, 50 insertions(+), 28 deletions(-)
 create mode 100644 Escrita_de_Minuta.py
 create mode 100644 Revisor_de_Minuta.py

diff --git a/Escrita_de_Minuta.py b/Escrita_de_Minuta.py
new file mode 100644
index 000000000..d1b081f9f
--- /dev/null
+++ b/Escrita_de_Minuta.py
@@ -0,0 +1,8 @@
+import streamlit as st
+
+st.title(body='✍️ StartLegal - Escritor de Minutas')
+st.header("Assistente de Elaboração de Escrituras", divider='gray', )
+
+st.write(
+    "Anexe os documentos necessários das partes compradora e vendedora e a escritura do imóvel."
+)
diff --git a/Revisor_de_Minuta.py b/Revisor_de_Minuta.py
new file mode 100644
index 000000000..65369e8de
--- /dev/null
+++ b/Revisor_de_Minuta.py
@@ -0,0 +1,27 @@
+import streamlit as st
+
+st.title(body='📄 StartLegal - Revisor de Minutas')
+st.header("Revisão de Minuta de Escrituras", divider='gray', )
+
+st.write(
+    "Anexe a minuta de uma escritura e em seguida os documentos necessários para revisão."
+)
+
+doc_ = '''Siga os passos abaixo para revisar informações da Minuta:
+1. No menu à esquerda, clique em "Anexar Minuta" para inserir uma minuta no sistema e iniciar o processo de revisão.
+2. Em seguida clique em "Parte Compradora" e insira no sistema os documentos necessários em cada aba disponível (se necessário).
+    
+    2.1. Aguarde o sistema extrair as informações e realizar a comparação com a Minuta fornecida.
+
+    2.2. Caso encontre alguma inconsistência, reportar o escrivão e finalizar o processo de revisão.
+
+3. Por último, clique em "Parte Vendedora" e insira os documentos solicitados.
+
+    3.1. Aguarde o sistema extrair as informações e realizar a comparação com a Minuta fornecida.
+
+    3.2 Caso encontre alguma inconsistência, reportar o escrivão e finalizar o processo de revisão.
+'''
+
+st.markdown(
+    doc_
+)
\ No newline at end of file
diff --git a/StartLegal.py b/StartLegal.py
index f1cb92284..34293c4bd 100644
--- a/StartLegal.py
+++ b/StartLegal.py
@@ -2,32 +2,21 @@
 from streamlit.logger import get_logger
 
 
+st.set_page_config(page_title="StartLegal - IA para Cartórios", page_icon="📄")
+
 logger = get_logger(__name__)
 
-st.set_page_config(page_title="StartLegal", page_icon="📄")
+escritor_page = st.Page("Escrita_de_Minuta.py", title="Escrita de Minuta", icon="✍️")
 
-st.title(body='📄 StartLegal')
-st.header("Módulo Revisor de Escrituras", divider='gray', )
+revisor_page = st.Page("Revisor_de_Minuta.py", title="Guia de Usabilidade", icon="📄")
+upload_minuta_page = st.Page("pages/1_Anexar_Minuta.py", title="Minuta", icon="📄")
+parte_compradora_page = st.Page("pages/2_Parte_Compradora.py", title="Parte Compradora", icon="📄")
+parte_vendedora_page = st.Page("pages/3_Parte_Vendedora.py", title="Parte Vendedora", icon="📄")
 
-st.write(
-    "Anexe a minuta de uma escritura e em seguida os documentos necessários para revisão."
+pg = st.navigation(
+    {
+        "Escrita de Minutas": [escritor_page],
+        "Revisão de Minutas": [revisor_page, upload_minuta_page, parte_compradora_page, parte_vendedora_page],
+    }
 )
-
-doc_ = '''Siga os passos abaixo para revisar informações da Minuta:
-1. No menu à esquerda, clique em "Anexar Minuta" para inserir uma minuta no sistema e iniciar o processo de revisão.
-2. Em seguida clique em "Parte Compradora" e insira no sistema os documentos necessários em cada aba disponível (se necessário).
-    
-    2.1. Aguarde o sistema extrair as informações e realizar a comparação com a Minuta fornecida.
-
-    2.2. Caso encontre alguma inconsistência, reportar o escrivão e finalizar o processo de revisão.
-
-3. Por último, clique em "Parte Vendedora" e insira os documentos solicitados.
-
-    3.1. Aguarde o sistema extrair as informações e realizar a comparação com a Minuta fornecida.
-
-    3.2 Caso encontre alguma inconsistência, reportar o escrivão e finalizar o processo de revisão.
-'''
-
-st.markdown(
-    doc_
-)
\ No newline at end of file
+pg.run()
\ No newline at end of file
diff --git a/multiple_files_bot.Dockerfile b/multiple_files_bot.Dockerfile
index 360ea3f7e..041e367a1 100644
--- a/multiple_files_bot.Dockerfile
+++ b/multiple_files_bot.Dockerfile
@@ -22,6 +22,8 @@ ADD rag_utils rag_utils
 ADD pages pages
 COPY __init__.py .
 COPY StartLegal.py .
+COPY Revisor_de_Minuta.py .
+COPY Escrita_de_Minuta.py .
 COPY prompts.json .
 COPY utils.py .
 COPY chains.py .
diff --git a/pages/1_Anexar_Minuta.py b/pages/1_Anexar_Minuta.py
index e2b76225d..fdc5fbdea 100644
--- a/pages/1_Anexar_Minuta.py
+++ b/pages/1_Anexar_Minuta.py
@@ -15,10 +15,6 @@
     st.session_state.init = True
     init()
     
-st.set_page_config(page_title="StartLegal - Anexar a Minuta", layout="wide")
-
-
-
 st.subheader(
     "Anexe a minuta da escritura para iniciar a revisão.",
     divider='gray'

From 6a3b029f744f11579e223d288433631eb252d94b Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Tue, 17 Jun 2025 16:02:13 -0300
Subject: [PATCH 21/24] Feature layout.

---
 Escrita_de_Minuta.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/Escrita_de_Minuta.py b/Escrita_de_Minuta.py
index d1b081f9f..2fdc71504 100644
--- a/Escrita_de_Minuta.py
+++ b/Escrita_de_Minuta.py
@@ -6,3 +6,37 @@
 st.write(
     "Anexe os documentos necessários das partes compradora e vendedora e a escritura do imóvel."
 )
+
+parte_compradora = st.container()
+
+parte_compradora.markdown("**Parte Compradora**")
+parte_compradora.file_uploader(
+    "Anexe os documentos da parte compradora",
+    type=["pdf", "jpg", "jpeg", "png"],
+    key="parte_compradora",
+    accept_multiple_files=True
+)
+
+st.divider()
+
+parte_vendedora = st.container()
+
+parte_vendedora.markdown("**Parte Vendedora**")
+parte_vendedora.file_uploader(
+    "Anexe os documentos da parte vendedora",
+    type=["pdf", "jpg", "jpeg", "png"],
+    key="parte_vendedora",
+    accept_multiple_files=True
+)
+
+st.divider()
+
+imovel = st.container()
+
+imovel.markdown("**Escritura do Imóvel**")
+imovel.file_uploader(
+    "Anexe a escritura do imóvel",
+    type=["pdf", "jpg", "jpeg", "png"], 
+    key="imovel",
+    accept_multiple_files=True
+)
\ No newline at end of file

From 9003a7a7b3d71f7d18161b8260f75e00f2f7e1a9 Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Tue, 17 Jun 2025 16:03:07 -0300
Subject: [PATCH 22/24] Setting up Wide layout configuration.

---
 StartLegal.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/StartLegal.py b/StartLegal.py
index 34293c4bd..8879df430 100644
--- a/StartLegal.py
+++ b/StartLegal.py
@@ -2,7 +2,7 @@
 from streamlit.logger import get_logger
 
 
-st.set_page_config(page_title="StartLegal - IA para Cartórios", page_icon="📄")
+st.set_page_config(page_title="StartLegal - IA para Cartórios", page_icon="🤖", layout="wide")
 
 logger = get_logger(__name__)
 

From bf2821f9d766fc81cb848ffe76e2448dae57c3b9 Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Wed, 18 Jun 2025 17:20:10 -0300
Subject: [PATCH 23/24] feat: Threaded processing and progress bar
 synchronization.

---
 Escrita_de_Minuta.py | 163 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 144 insertions(+), 19 deletions(-)

diff --git a/Escrita_de_Minuta.py b/Escrita_de_Minuta.py
index 2fdc71504..77d967c9e 100644
--- a/Escrita_de_Minuta.py
+++ b/Escrita_de_Minuta.py
@@ -1,4 +1,117 @@
 import streamlit as st
+from rag_utils.config import init
+from rag_utils.pipeline import RAG_document_retrieval
+import base64
+import threading
+import logging
+import time
+
+session_state_status_percent = 0
+
+
+def parte_compradora_agents_thread(uploaded_files):
+    if uploaded_files:
+        st.session_state.status = "Processando documentos da parte compradora..."
+        global session_state_status_percent
+        session_state_status_percent = 0
+        len_uploaded_files = len(uploaded_files)
+        logging.info("Parte compradora: Iniciando o processamento dos documentos.")
+        
+        # Simulate processing each uploaded file
+        for p, uploaded_file in enumerate(uploaded_files):
+            # Simulate processing time
+            time.sleep(1)
+            st.session_state.status = f"Processando {uploaded_file.name}..."
+            session_state_status_percent = (session_state_status_percent+p+1) / len_uploaded_files
+            logging.info(f"Parte compradora: Processando {uploaded_file.name}...")
+            logging.info(f"Parte compradora (Thread): Progresso {session_state_status_percent:.2%}")
+
+            # Here you would typically call your RAG_document_retrieval function
+            # For example: RAG_document_retrieval(uploaded_file)
+        
+        st.session_state.status = "Documentos da parte compradora processados com sucesso!"
+        logging.info("Parte compradora: Documentos processados com sucesso!")
+
+
+def parte_compradora_button_callback(uploaded_files, container):
+    global session_state_status_percent
+    
+    thread = threading.Thread(
+        target=parte_compradora_agents_thread,
+        args=(uploaded_files,),
+        daemon=True
+    )
+    thread.start()
+    
+    with container:
+        bar = st.progress(0, text_ocr)
+        while session_state_status_percent*100 < 100:
+            time.sleep(0.1)
+            bar.progress(session_state_status_percent, text_ocr)
+            logging.info(f"Parte compradora: Progresso {session_state_status_percent:.2%}")
+        bar.empty()
+        thread.join()
+    st.session_state.status = "Processamento finalizado!"
+    logging.info("Parte compradora: Processamento finalizado!")
+
+
+def parte_vendedora_button_callback():
+    pass
+
+
+def imovel_button_callback():
+    pass
+
+
+def container_files_uploader_and_text_writer(container, labels: dict, key, callback):
+    container.markdown(f"**{labels['markdown_label']}**")
+    
+    uploaded_files = container.file_uploader(
+        labels['file_uploader_label'],
+        type=["pdf", "jpg", "jpeg", "png"],
+        key=f"{key}_file_uploader",
+        accept_multiple_files=True
+    )
+    
+    write_text_button = container.button(
+        labels['button_label'],
+        help="Clique para gerar o parágrafo com as informações extraídas dos documentos.",
+        disabled=not uploaded_files,
+        on_click=callback,
+        args=(uploaded_files, container),
+        key=f"{key}_button"
+    )
+    
+    if uploaded_files and write_text_button:
+        container.write(f"Status: {st.session_state.status}")
+
+logging.basicConfig(level = logging.INFO)
+
+if 'init' not in st.session_state:
+    st.session_state.init = True
+    if 'status' not in st.session_state:
+        st.session_state.status = "Aguardando o upload dos documentos..."
+    init()
+
+if 'init_writer_page' not in st.session_state:
+    st.session_state.init_buyer_writer_page = True
+
+    st.session_state.buyer_documents_list = [
+        'CNH Comprador', 
+        'Comprovante de Residência Comprador', 
+        'Certidão de Casamento Comprador',
+        'Pacto Antenupcial ou Declaração de União Estável',
+        'CNH Cônjuge',
+        'Quitação ITBI'
+    ]
+    
+    st.session_state.owner_documents_list = [
+        'CNH Vendedor',
+        'Comprovante de Residência Vendedor',
+        'Matrícula do Imóvel'
+    ]
+
+text_ocr = "Extraindo informações dos documentos..."
 
 st.title(body='✍️ StartLegal - Escritor de Minutas')
 st.header("Assistente de Elaboração de Escrituras", divider='gray', )
@@ -9,34 +122,46 @@
 
 parte_compradora = st.container()
 
-parte_compradora.markdown("**Parte Compradora**")
-parte_compradora.file_uploader(
-    "Anexe os documentos da parte compradora",
-    type=["pdf", "jpg", "jpeg", "png"],
-    key="parte_compradora",
-    accept_multiple_files=True
+container_files_uploader_and_text_writer(
+    container=parte_compradora,
+    labels={
+        'markdown_label': '**Parte Compradora**',
+        'file_uploader_label': 'Anexe os documentos da parte compradora',
+        'button_label': 'Gerar Parágrafo',
+        'progress_text': text_ocr
+    },
+    key='parte_compradora',
+    callback=parte_compradora_button_callback
 )
 
 st.divider()
 
 parte_vendedora = st.container()
 
-parte_vendedora.markdown("**Parte Vendedora**")
-parte_vendedora.file_uploader(
-    "Anexe os documentos da parte vendedora",
-    type=["pdf", "jpg", "jpeg", "png"],
-    key="parte_vendedora",
-    accept_multiple_files=True
+container_files_uploader_and_text_writer(
+    container=parte_vendedora,
+    labels={
+        'markdown_label': '**Parte Vendedora**',
+        'file_uploader_label': 'Anexe os documentos da parte vendedora',
+        'button_label': 'Gerar Parágrafo',
+        'progress_text': text_ocr
+    },
+    key='parte_vendedora',
+    callback=parte_vendedora_button_callback
 )
 
 st.divider()
 
 imovel = st.container()
 
-imovel.markdown("**Escritura do Imóvel**")
-imovel.file_uploader(
-    "Anexe a escritura do imóvel",
-    type=["pdf", "jpg", "jpeg", "png"], 
-    key="imovel",
-    accept_multiple_files=True
-)
\ No newline at end of file
+container_files_uploader_and_text_writer(
+    container=imovel,
+    labels={
+        'markdown_label': '**Escritura do Imóvel**',
+        'file_uploader_label': 'Anexe a escritura do imóvel',
+        'button_label': 'Gerar Parágrafo',
+        'progress_text': text_ocr
+    },
+    key='imovel',
+    callback=imovel_button_callback
+)

From cf90b9588bfa96b5609e37e131cdf85f73f6e4c6 Mon Sep 17 00:00:00 2001
From: Leonardo Valeriano Neri <leonardovalerianoneri@gmail.com>
Date: Wed, 18 Jun 2025 18:58:17 -0300
Subject: [PATCH 24/24] fix: use a percentage instead of a cumulative counter.
 Reset the percentage at the end of thread execution.

---
 Escrita_de_Minuta.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/Escrita_de_Minuta.py b/Escrita_de_Minuta.py
index 77d967c9e..06dbe3060 100644
--- a/Escrita_de_Minuta.py
+++ b/Escrita_de_Minuta.py
@@ -6,23 +6,26 @@
 import logging
 import time
 
+
 session_state_status_percent = 0
 
 
 def parte_compradora_agents_thread(uploaded_files):
+    global session_state_status_percent
+
     if uploaded_files:
         st.session_state.status = "Processando documentos da parte compradora..."
-        global session_state_status_percent
+        logging.info("Parte compradora: Iniciando o processamento dos documentos.")
+        
         session_state_status_percent = 0
         len_uploaded_files = len(uploaded_files)
-        logging.info("Parte compradora: Iniciando o processamento dos documentos.")
         
         # Simulate processing each uploaded file
         for p, uploaded_file in enumerate(uploaded_files):
             # Simulate processing time
             time.sleep(1)
             st.session_state.status = f"Processando {uploaded_file.name}..."
-            session_state_status_percent = (session_state_status_percent+p+1) / len_uploaded_files
+            session_state_status_percent = (p+1) / len_uploaded_files
             logging.info(f"Parte compradora: Processando {uploaded_file.name}...")
             logging.info(f"Parte compradora (Thread): Progresso {session_state_status_percent:.2%}")
 
@@ -51,6 +54,8 @@ def parte_compradora_button_callback(uploaded_files, container):
             logging.info(f"Parte compradora: Progresso {session_state_status_percent:.2%}")
         bar.empty()
         thread.join()
+
+    session_state_status_percent = 0
     st.session_state.status = "Processamento finalizado!"
     logging.info("Parte compradora: Processamento finalizado!")