llm-scripts/paperless.py at main · Jay4242/llm-scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/usr/bin/env python3

import os
import json
import requests
from dotenv import load_dotenv
from openai import OpenAI
import httpx

# Load settings via python-dotenv before anything reads os.environ.
# The code below looks up PAPERLESS_BASE_URL, PAPERLESS_API_TOKEN and
# LLM_BASE_URL; presumably these are supplied through a local .env file.
load_dotenv()

class PaperlessAPI:
    """Small client for a Paperless REST API, plus an LLM helper.

    Every HTTP helper prints a short error message and returns ``None``
    when the request fails, so callers should test the result for
    truthiness before using it.
    """

    def __init__(self, base_url, api_token, timeout=30):
        """
        Store connection settings and build the shared request headers.

        Args:
            base_url: Root URL of the API; a trailing slash is tolerated.
            api_token: Token sent in the ``Authorization`` header.
            timeout: Seconds to wait on each HTTP request before giving up.
                Without a timeout, ``requests`` waits indefinitely.
        """
        self.base_url = base_url
        self.api_token = api_token
        self.timeout = timeout
        self.headers = {
            'Authorization': f'Token {self.api_token}',
            'Content-Type': 'application/json',
        }

    def llm(self, document, system, preprompt, postprompt, temp,
            model="llama-3.2-3b-it-q8_0"):
        """
        Send a document to the LLM server and return the response.

        The server address is read from the ``LLM_BASE_URL`` environment
        variable. The reply is streamed and concatenated into one string.

        Args:
            document: Text of the document to send.
            system: System prompt.
            preprompt: User message sent before the document.
            postprompt: User message sent after the document.
            temp: Sampling temperature.
            model: Model name passed to the server.

        Returns:
            The full response text (empty string if nothing was streamed).
        """
        llm_base_url = os.environ.get("LLM_BASE_URL")
        # NOTE(review): api_key is a placeholder; presumably the target
        # server does not check it -- confirm for your deployment.
        client = OpenAI(base_url=llm_base_url, api_key="none", timeout=httpx.Timeout(3600))

        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": preprompt},
                {"role": "user", "content": document},
                {"role": "user", "content": postprompt}
            ],
            temperature=temp,
            stream=True,
        )

        pieces = []
        for chunk in completion:
            # Some servers emit chunks with an empty choices list (for
            # example usage-only chunks); indexing [0] would raise.
            if not chunk.choices:
                continue
            text = chunk.choices[0].delta.content
            if text:
                pieces.append(text)

        return "".join(pieces)

    def _url(self, endpoint):
        """Join base URL and endpoint with exactly one slash and a trailing slash."""
        return f"{self.base_url.rstrip('/')}/{str(endpoint).strip('/')}/"

    def _request(self, method, endpoint, *, want_json=True, **kwargs):
        """
        Perform one HTTP request and handle errors uniformly.

        Args:
            method: HTTP verb in upper case (also used in the error message).
            endpoint: Path relative to the base URL.
            want_json: Return the decoded JSON body when true, otherwise the
                HTTP status code.
            **kwargs: Extra arguments forwarded to ``requests.request``.

        Returns:
            The decoded body or status code, or ``None`` on any request error.
        """
        url = self._url(endpoint)
        try:
            response = requests.request(
                method, url, headers=self.headers, timeout=self.timeout, **kwargs
            )
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            return response.json() if want_json else response.status_code
        except requests.exceptions.RequestException as e:
            print(f"Error during {method} request: {e}")
            return None

    def get(self, endpoint, params=None):
        """
        Generic GET request. Returns the decoded JSON body, or None on error.
        """
        return self._request("GET", endpoint, params=params)

    def post(self, endpoint, data):
        """
        Generic POST request. ``data`` is serialized to JSON.
        Returns the decoded JSON body, or None on error.
        """
        return self._request("POST", endpoint, data=json.dumps(data))

    def patch(self, endpoint, data):
        """
        Generic PATCH request. ``data`` is serialized to JSON.
        Returns the decoded JSON body, or None on error.
        """
        return self._request("PATCH", endpoint, data=json.dumps(data))

    def delete(self, endpoint):
        """
        Generic DELETE request. Returns the HTTP status code, or None on error.
        """
        return self._request("DELETE", endpoint, want_json=False)

    # Add specific API methods below.  Examples:
    def get_documents(self, page=1):
        """
        Retrieve a list of documents.
        """
        return self.get("documents", params={'page': page})

    def get_document(self, document_id):
        """
        Retrieve a specific document by ID.
        """
        return self.get(f"documents/{document_id}")

    def create_document(self, data):
        """
        Create a new document. Data should be a dictionary.
        """
        return self.post("documents", data)

    def search_documents(self, query, page=1):
        """
        Search for documents.
        """
        return self.get("documents", params={'query': query, 'page': page})

    def get_full_document(self, document_id):
        """
        Retrieve the full information for a specific document by ID.

        Uses the same endpoint as ``get_document``; kept for callers that
        use this name.
        """
        return self.get_document(document_id)


def _summarize_document(api, document_id, full_document):
    """Print a document's title and content, then an LLM summary of it."""
    print(f"--- Document {document_id} ---")
    print(f"Title: {full_document['title']}")
    print(f"Content: {full_document['content']}")

    # Get document details
    document_title = full_document['title']
    correspondent_id = full_document.get('correspondent')
    correspondent = None
    if correspondent_id:
        correspondent = api.get(f"correspondents/{correspondent_id}")
    correspondent_name = correspondent['name'] if correspondent else "Unknown"

    # Get LLM prompts
    system_prompt = "You are a helpful assistant."
    pre_prompt = f"The following is a document titled {document_title}, from {correspondent_name}:"
    post_prompt = "Summarize the document in three sentences or less."
    temperature = 0.7

    # Call the LLM function
    llm_response = api.llm(full_document['content'], system_prompt, pre_prompt, post_prompt, temperature)
    print(f"Summary: {llm_response}")


def main():
    """
    Interactive entry point: list documents, search, and summarize one.

    Reads PAPERLESS_BASE_URL and PAPERLESS_API_TOKEN from the environment
    and returns early with an error message if either is missing. It then
    lists the first page of documents, prompts for a search query, lets the
    user pick one hit by number, and prints that document together with an
    LLM-generated summary.
    """
    # Load configuration from environment variables or a config file.
    base_url = os.environ.get("PAPERLESS_BASE_URL")
    api_token = os.environ.get("PAPERLESS_API_TOKEN")

    if not base_url or not api_token:
        print("Error: PAPERLESS_BASE_URL and PAPERLESS_API_TOKEN environment variables must be set.")
        return

    api = PaperlessAPI(base_url, api_token)

    # Example usage:
    documents = api.get_documents()
    if documents:
        print(f"Found {documents['count']} documents.")
        for document in documents['results']:
            print(f"  - {document['title']} (ID: {document['id']})")

    # Example of searching for documents:
    query = input("Enter search query: ")
    search_results = api.search_documents(query, 1)

    # Validate both fields before use: a failed request yields None, and an
    # empty result page would otherwise cause a division by zero below.
    payload = search_results or {}
    results = payload.get('results') or []
    total_documents = payload.get('count') or 0
    if not results or total_documents <= 0:
        print(f"No documents found matching '{query}'.")
        return

    page_size = len(results)  # Assuming page_size is the number of results on the current page
    total_pages = (total_documents + page_size - 1) // page_size
    print(f"Found {total_documents} documents matching '{query}'. Total pages: {total_pages}")

    for i, document in enumerate(results):
        print(f"  {i+1}. {document['title']} (ID: {document['id']})")

    while True:
        selection = input(f"Enter the number of the document to view (1-{len(results)}), or 0 to exit: ")
        # Only the int() conversion is guarded, so a ValueError raised by
        # the API or LLM calls is not misreported as bad user input.
        try:
            selection = int(selection)
        except ValueError:
            print("Invalid input. Please enter a number.")
            continue

        if selection == 0:
            break
        if not 1 <= selection <= len(results):
            print("Invalid selection. Please try again.")
            continue

        document_id = results[selection - 1]['id']
        full_document = api.get_full_document(document_id)
        if not full_document:
            print(f"Failed to retrieve document {document_id}.")
            continue

        _summarize_document(api, document_id, full_document)
        return  # Exit after displaying the document and summary


# Run the interactive session only when executed as a script, not on import.
if __name__ == "__main__":
    main()