| | import logging |
| | import os |
| | import docx |
| | import PyPDF2 |
| | from docx.shared import RGBColor, Pt |
| | from io import BytesIO, IOBase |
| | import tempfile |
| | import re |
| | import datetime |
| | import torch |
| |
|
| | import gradio as gr |
| | from transformers import AutoModelForCausalLM, AutoTokenizer |
| | import huggingface_hub |
| |
|
| | |
| | |
| | |
# Application-wide logging: timestamped records at INFO level and above.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Shared logger used by every function in this module.
logger = logging.getLogger("LLM-Legal-App")
| |
|
| | |
| | |
| | |
def initialize_model():
    """Load the phi-2 causal language model and its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If either the tokenizer or the model cannot be loaded.
            The original exception is attached as ``__cause__``.
    """
    logger.info("Initializing phi-2 model and tokenizer...")
    model_name = "microsoft/phi-2"
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            # NOTE(review): this allows code shipped in the model repository
            # to run locally. Confirm it is still required for phi-2 with the
            # installed transformers version before keeping it enabled.
            trust_remote_code=True,
        )
    except Exception as e:
        logger.exception("Error initializing Hugging Face model.")
        # Chain the cause so the underlying failure stays in the traceback.
        raise ValueError(f"Failed to initialize model: {e}") from e

    logger.info("Successfully initialized phi-2 model and tokenizer.")
    return model, tokenizer


# Loaded once at import time; generate_with_model reads these module globals.
model, tokenizer = initialize_model()
| |
|
| | |
| | |
| | |
def generate_with_model(prompt, max_length=1400, temperature=0.3):
    """Generate a completion for *prompt* with the module-level model.

    Args:
        prompt: Text fed to the model.
        max_length: Upper bound on the number of NEW tokens to generate
            (passed to ``generate`` as ``max_new_tokens``).
        temperature: Sampling temperature. Values greater than zero enable
            nucleus sampling (``top_p=0.9``); zero or ``None`` selects greedy
            decoding.

    Returns:
        The generated text without the prompt. On any failure a string
        starting with ``"Error generating text:"`` is returned instead of
        raising, which is the contract callers in this module rely on.
    """
    logger.info("Generating text with phi-2 model.")

    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_len = inputs["input_ids"].shape[-1]

        # Keep prompt + completion inside the model's context window when the
        # config advertises one; generating past it is not reliable.
        max_new_tokens = max_length
        context_limit = getattr(model.config, "max_position_embeddings", None)
        if context_limit:
            room = context_limit - prompt_len
            if room <= 0:
                logger.error(
                    "Prompt of %d tokens leaves no room in the %d-token context window.",
                    prompt_len, context_limit,
                )
                return (
                    "Error generating text: the input is too long for the "
                    f"model ({prompt_len} tokens, limit {context_limit})."
                )
            if room < max_new_tokens:
                logger.warning(
                    "Reducing max_new_tokens from %d to %d to fit the context window.",
                    max_new_tokens, room,
                )
                max_new_tokens = room

        generation_config = {
            "max_new_tokens": max_new_tokens,
            "pad_token_id": tokenizer.eos_token_id,
        }
        # Sampling-only knobs are passed only when sampling is enabled, so
        # greedy decoding does not receive parameters it ignores.
        if temperature and temperature > 0:
            generation_config.update(
                do_sample=True, temperature=temperature, top_p=0.9
            )
        else:
            generation_config["do_sample"] = False

        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_config)

        # Decode only the newly generated tokens. Stripping the prompt by
        # string prefix fails whenever decoding does not reproduce the prompt
        # text exactly.
        new_tokens = outputs[0][prompt_len:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        logger.info("Text generation complete.")
        return response

    except Exception as e:
        logger.exception("Error during text generation.")
        return f"Error generating text: {e}"
| |
|
def generate_legal_document(doc_type, party_a, party_b, context, country):
    """Draft a legal document by prompting the language model.

    Args:
        doc_type: Kind of document to draft (e.g. "MOU", "NDA").
        party_a: Name of the first party; a placeholder is used when empty.
        party_b: Name of the second party; a placeholder is used when empty.
        context: Short brief of the agreement; a placeholder is used when empty.
        country: Governing-law country; a placeholder is used when empty.

    Returns:
        Whatever ``generate_with_model`` returns for the assembled prompt:
        the drafted text, or its error string.
    """
    logger.info("Starting generation for doc_type=%r.", doc_type)

    # Substitute readable placeholders so the prompt never contains "None"
    # or an empty slot.
    party_a = party_a or "[Party A Not Provided]"
    party_b = party_b or "[Party B Not Provided]"
    context = context or "[Context Not Provided]"
    country = country or "[Governing Law Not Provided]"

    # Built line by line so the prompt's whitespace does not depend on how
    # this function happens to be indented.
    prompt = (
        "\n"
        f"You are a helpful legal assistant. Generate a {doc_type} for:\n"
        f"1) {party_a}\n"
        f"2) {party_b}\n"
        "\n"
        "Context/brief of the agreement:\n"
        f"{context}.\n"
        "\n"
        "The document should include:\n"
        f"- Purpose of the {doc_type}\n"
        "- Responsibilities and obligations of each party\n"
        "- Confidentiality terms\n"
        "- Payment terms (use [To Be Determined] if not specified)\n"
        "- Term (duration) and termination\n"
        f"- Governing law: {country}\n"
        f"- Jurisdiction: [Appropriate region in {country} if not provided]\n"
        "- Signature blocks\n"
        "\n"
        "Use formal language, but keep it relatively clear and readable.\n"
        "For any missing information, use placeholders like [To Be Determined].\n"
        "Include a disclaimer that this is a draft and not legally binding until reviewed and signed.\n"
    )
    logger.debug("Generated prompt:\n%s", prompt)

    return generate_with_model(prompt, max_length=1400, temperature=0.3)
| |
|
# Prompt for the checklist-style review; ``{doc_text}`` is the only placeholder.
_RULE_BASED_REVIEW_TEMPLATE = '''
You are a legal AI assistant reviewing a document. Provide a review,
structured into the following numbered sections. Be concise and factual. Do NOT
use Markdown. Use plain text labels for each section.

Document text:
"""
{doc_text}
"""

Review Sections:

1) Parties and Authority:
- Confirm the full legal names of all parties.
- Make sure the people signing can legally commit their organizations.

2) Scope of Work / Obligations:
- Check that the contract clearly describes what each side must do.
- Look for deadlines, milestones, or deliverables.
- Ensure everything is realistic and not overly vague.

3) Definitions and Key Terms:
- See if there's a section that explains important terms.
- Ensure those terms are used the same way throughout the contract.
- Avoid or clarify any ambiguous language.

4) Payment Terms (If Applicable):
- Check how much is owed, the currency, and when it's due.
- Look for penalties, interest, or late fees.
- Note how and when invoices are sent or paid.

5) Term and Termination:
- Identify when the contract starts and ends.
- Understand how it can be renewed.
- See the conditions and notice required for ending the contract early.

6) Intellectual Property (IP) Rights:
- Confirm who owns any work created under the agreement.
- Note if licenses are granted for using the IP, and for how long.

7) Confidentiality and Privacy:
- Check what is considered confidential information.
- Look for exceptions (like already public info).
- See how long the confidentiality rules apply.

8) Warranties and Representations:
- Note any performance guarantees or quality promises.
- Look for disclaimers (like "as is" clauses).

9) Indemnification:
- See who will pay legal costs or damages if there's a lawsuit or claim.
- Check any limits on what's covered.

10) Limitation of Liability:
- Check if there's a maximum amount one side can claim in damages.
- Look for excluded damages, like lost profits.

11) Dispute Resolution and Governing Law:
- See if disputes go to arbitration, mediation, or court.
- Note which state or country's laws will apply.

12) Force Majeure (Unforeseen Events):
- Look for events like natural disasters or war that could suspend obligations.
- See if there are notice requirements for these events.

13) Notices and Amendments:
- Check how official notices must be sent (email, mail, etc.).
- Find out how to properly change the contract (in writing, signatures, etc.).

14) Entire Agreement and Severability:
- Confirm that this contract replaces all previous agreements.
- Ensure that if one clause is invalid, the rest still stands.

15) Signatures and Dates:
- Make sure the right people sign in their proper roles.
- Verify the date of signature and when the contract goes into effect.

16) Ambiguities, Contradictions, and Hidden Clauses:
- Watch for contradictory statements or clauses that conflict.
- Beware of vague phrases like "best efforts" without clear guidelines.
- Check for hidden or "buried" clauses in fine print or attachments.

17) Compliance and Regulatory Alignment:
- Ensure the contract follows relevant laws and rules.
- Check for industry-specific requirements.

18) Practical Considerations:
- Make sure deadlines and other requirements are doable.
- Confirm all negotiations are reflected in writing.
- Avoid blank or undefined items (like fees or dates "to be decided").
'''

# Prompt for the wording/style review; ``{doc_text}`` is the only placeholder.
_WORDING_ANALYSIS_TEMPLATE = '''
You are a legal AI assistant. Analyze the following legal document for its wording:

Document text:
"""
{doc_text}
"""

Provide a comprehensive analysis of the document's wording, covering these aspects for the ENTIRE document text:

1. **Clarity and Precision:** Identify ambiguous or vague language, and suggest improvements.
2. **Readability:** Assess the overall readability and suggest improvements for clarity, including sentence structure and complexity.
3. **Formal Tone:** Check if the language maintains a formal and professional tone appropriate for a legal document, and suggest changes if needed.
4. **Consistency:** Ensure consistent use of terms and phrasing throughout the document. Point out any inconsistencies.
5. **Redundancy:** Identify any unnecessary repetition of words or phrases.
6. **Jargon and Technical Terms:** Identify jargon or technical terms that might be unclear to a non-expert, and suggest clearer alternatives where appropriate.
7. **Overall Recommendations:** Give overall recommendations for improving the document's wording.

Provide your analysis in plain text, without using Markdown. Label each section of your analysis clearly (e.g., "Clarity and Precision:", "Readability:", etc.).
'''

# Prefix of the string generate_with_model returns when generation fails.
_GENERATION_ERROR_PREFIX = "Error generating text"


def _run_review_pass(label, prompt, max_length):
    """Run one review prompt and return ``(text, error)``; exactly one is None."""
    try:
        result = generate_with_model(prompt, max_length=max_length, temperature=0.3)
    except Exception as e:
        logger.exception("Error during %s.", label)
        return None, f"Error during {label}: {e}"

    # generate_with_model reports failures through its return value, so the
    # except clause above is not enough to notice them.
    if isinstance(result, str) and result.startswith(_GENERATION_ERROR_PREFIX):
        logger.error("Generation failed during %s: %s", label, result)
        return None, f"Error during {label}: {result}"
    return result, None


def review_legal_document(doc_text, doc_type, party_a, party_b):
    """Review a document in two model passes: a checklist review, then wording.

    Args:
        doc_text: Full text of the document under review.
        doc_type: Accepted for interface compatibility; not used in the prompts.
        party_a: Accepted for interface compatibility; not used in the prompts.
        party_b: Accepted for interface compatibility; not used in the prompts.

    Returns:
        A combined report with a "Rule-Based Analysis:" part followed by a
        "Wording Analysis:" part, or a string starting with "Error" when
        either pass fails.
    """
    logger.info("Starting document review (rule-based and wording).")

    rule_based_prompt = _RULE_BASED_REVIEW_TEMPLATE.format(doc_text=doc_text)
    logger.debug("Generated rule-based review prompt:\n%s", rule_based_prompt)
    rule_based_review, error = _run_review_pass(
        "rule-based review", rule_based_prompt, 2000
    )
    if error is not None:
        return error

    wording_analysis_prompt = _WORDING_ANALYSIS_TEMPLATE.format(doc_text=doc_text)
    logger.debug("Generated wording analysis prompt:\n%s", wording_analysis_prompt)
    wording_analysis, error = _run_review_pass(
        "wording analysis", wording_analysis_prompt, 1000
    )
    if error is not None:
        return error

    return (
        f"Rule-Based Analysis:\n\n{rule_based_review}"
        f"\n\nWording Analysis:\n\n{wording_analysis}"
    )
| |
|
| | |
| | |
| | |
| |
|
def _docx_stream_text(stream):
    """Return the paragraph text of a DOCX held in a binary stream."""
    doc_obj = docx.Document(stream)
    return "\n".join(para.text for para in doc_obj.paragraphs).strip()


def _pdf_stream_text(stream):
    """Return the extractable text of a PDF held in a binary stream."""
    pdf_reader = PyPDF2.PdfReader(stream)
    # Extract each page once; pages with no extractable text are skipped.
    page_texts = (page.extract_text() for page in pdf_reader.pages)
    return "\n".join(text for text in page_texts if text).strip()


def parse_bytesio(file_data: BytesIO) -> str:
    """Extract text from an in-memory PDF or DOCX.

    The leading bytes decide which parser is tried first; the other one is
    used as a fallback, so a stream is only rejected after both have failed.

    Args:
        file_data: Seekable binary stream containing a PDF or DOCX file.

    Returns:
        The extracted text, or a string starting with "Error" when the
        stream cannot be read or neither parser accepts it.
    """
    logger.info("Parsing BytesIO object...")
    try:
        file_data.seek(0)
        magic = file_data.read(4)
    except Exception as e:
        logger.exception(f"Error parsing BytesIO: {e}")
        return f"Error parsing BytesIO: {e}"

    parsers = [("DOCX", _docx_stream_text), ("PDF", _pdf_stream_text)]
    if isinstance(magic, bytes) and magic.startswith(b"%PDF"):
        parsers.reverse()

    # Falling back on *any* parser failure matters: when handed a stream,
    # python-docx appears to pass it straight to zipfile, so a non-DOCX input
    # likely fails with a zip error rather than PackageNotFoundError.
    # TODO confirm against the installed python-docx version.
    last_error = None
    for kind, parser in parsers:
        try:
            file_data.seek(0)
            return parser(file_data)
        except Exception as e:
            logger.info("BytesIO could not be parsed as %s: %s", kind, e)
            last_error = e

    logger.error("Error processing BytesIO: %s", last_error)
    return f"Error processing file content: {last_error}"
| |
|
def parse_uploaded_file_path(file_data) -> str:
    """Extract text from an uploaded PDF or DOCX.

    Args:
        file_data: One of: a filesystem path (``str`` or ``os.PathLike``), a
            dict with a ``'name'`` key holding the path, a binary stream
            (delegated to ``parse_bytesio``), or an object whose ``name``
            attribute is a path string.

    Returns:
        The extracted text; ``""`` when nothing was provided;
        ``"Unsupported file format."`` for other extensions; or a string
        starting with "Error" when the input type is unknown or parsing fails.
    """
    if not file_data:
        logger.warning("No file provided.")
        return ""

    if isinstance(file_data, (BytesIO, IOBase)):
        return parse_bytesio(file_data)

    if isinstance(file_data, str):
        file_path = file_data
        logger.info(f"Received filepath: {file_path}")
    elif isinstance(file_data, os.PathLike):
        file_path = os.fspath(file_data)
        logger.info(f"Received filepath: {file_path}")
    elif isinstance(file_data, dict) and 'name' in file_data:
        file_path = file_data['name']
        logger.info(f"Received file object with name: {file_path}")
    elif isinstance(getattr(file_data, "name", None), str):
        # Presumably a temp-file wrapper handed over by the UI layer, whose
        # ``name`` is the on-disk path — verify against the caller.
        file_path = file_data.name
        logger.info(f"Received file object with name: {file_path}")
    else:
        logger.error(f"Unexpected file_data type: {type(file_data)}")
        return "Error: Unexpected file data format."

    logger.info(f"Attempting to parse file at {file_path}")
    try:
        ext = os.path.splitext(file_path)[1].lower()
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                pdf_reader = PyPDF2.PdfReader(f)
                # Extract each page once; skip pages without extractable text.
                page_texts = [page.extract_text() for page in pdf_reader.pages]
            return "\n".join(text for text in page_texts if text).strip()
        if ext == ".docx":
            doc_obj = docx.Document(file_path)
            return "\n".join(para.text for para in doc_obj.paragraphs).strip()
        return "Unsupported file format."
    except Exception as e:
        logger.exception(f"Error parsing file: {e}")
        return f"Error parsing file: {e}"
| |
|
| | |
| | |
| | |
| |
|
# (pattern, replacement) pairs applied in order by clean_markdown.
_MARKDOWN_SUBSTITUTIONS = (
    # ATX headings: "## Title" -> "Title"
    (re.compile(r'^#+\s+', re.MULTILINE), ''),
    # Horizontal rules, removed before emphasis handling can touch them.
    (re.compile(r'^[-_*]{3,}$', re.MULTILINE), ''),
    # Bold. Content must be non-empty and must not start or end with
    # whitespace, so runs such as "________" or "****" are left alone.
    (re.compile(r'\*\*(?=\S)(.+?)(?<=\S)\*\*'), r'\1'),
    (re.compile(r'(?<!\w)__(?=[^\s_])(.+?)(?<=[^\s_])__(?!\w)'), r'\1'),
    # Italic. Underscores inside words (snake_case, To_Be_Determined) and
    # free-standing asterisks ("2 * 3") are not emphasis.
    (re.compile(r'\*(?=[^\s*])(.+?)(?<=[^\s*])\*'), r'\1'),
    (re.compile(r'(?<!\w)_(?=[^\s_])(.+?)(?<=[^\s_])_(?!\w)'), r'\1'),
    # Bullet and ordered-list markers at the start of a line.
    (re.compile(r'^[\-\+\*]\s+', re.MULTILINE), ''),
    (re.compile(r'^\d+\.\s+', re.MULTILINE), ''),
    # Images are dropped entirely; links keep only their label. Images must
    # go first because their syntax contains the link syntax.
    (re.compile(r'!\[(.*?)\]\((.*?)\)'), ''),
    (re.compile(r'\[(.*?)\]\((.*?)\)'), r'\1'),
)


def clean_markdown(text):
    """Strip common Markdown formatting and return plain text.

    Removes heading markers, horizontal rules, bold/italic delimiters, list
    markers and images, and reduces links to their label. Emphasis is only
    recognised when the opening and closing delimiters match and wrap
    non-empty text, so signature blanks ("By: ______") and identifiers
    containing underscores are preserved.

    Args:
        text: Markdown text; may be ``None`` or empty.

    Returns:
        The cleaned text with surrounding whitespace stripped, or ``""`` for
        empty input.
    """
    if not text:
        return ""
    for pattern, replacement in _MARKDOWN_SUBSTITUTIONS:
        text = pattern.sub(replacement, text)
    return text.strip()
| |
|
# Markers that introduce the two parts of a combined review.
_RULE_BASED_HEADING = "Rule-Based Analysis:"
_WORDING_HEADING = "Wording Analysis:"


def _add_red_heading(document, text):
    """Append *text* as a 14 pt red paragraph used as a section heading."""
    run = document.add_paragraph().add_run(text)
    run.font.size = Pt(14)
    run.font.color.rgb = RGBColor(0xFF, 0x00, 0x00)


def _add_review_sections(document, review_text):
    """Append the review, styling each part according to its marker."""
    # The review is split on blank lines, which separates each marker from
    # the text that follows it. Remember which part we are in so the
    # following chunks are still formatted as belonging to that part.
    current = None
    for section in review_text.split("\n\n"):
        body = section
        for marker in (_RULE_BASED_HEADING, _WORDING_HEADING):
            if section.startswith(marker):
                current = marker
                _add_red_heading(document, marker[:-1])
                body = section[len(marker):]
                break

        if current is None:
            # Text before any marker is kept as a single paragraph.
            document.add_paragraph(section)
            continue

        for line in body.split("\n"):
            if not line.strip():
                continue
            if current == _RULE_BASED_HEADING and re.match(r"\s*\d+\)", line):
                # Numbered checklist titles ("3) ...") are highlighted in red.
                numbered = document.add_paragraph(style='List Number')
                numbered.add_run(line).font.color.rgb = RGBColor(0xFF, 0x00, 0x00)
            else:
                document.add_paragraph(line)


def create_and_save_docx(doc_text, review_text=None, doc_type="Unknown", party_a="Party A", party_b="Party B"):
    """Build a DOCX report and save it to a temporary file.

    Args:
        doc_text: Generated document text; Markdown is stripped before it is
            written. Skipped when empty or ``None``.
        review_text: Combined review text. Skipped when empty or ``None``.
        doc_type: Document type, used in the title and the file name.
        party_a: First party name for the title; a default is used when empty.
        party_b: Second party name for the title; a default is used when empty.

    Returns:
        Path of the saved file. The file is not deleted automatically.
    """
    logger.debug("Creating and saving DOCX.")
    document = docx.Document()

    party_a = party_a or "Party A"
    party_b = party_b or "Party B"

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    # The type ends up in a file name, so keep only characters that are safe
    # there (no path separators or spaces).
    safe_doc_type = re.sub(r"[^\w.-]+", "_", str(doc_type)) or "Unknown"
    file_name = f"HF_AI_Review_{safe_doc_type}_{timestamp}.docx"

    title = f"DocumentCogito Analysis of {doc_type} between companies {party_a} and {party_b}"
    document.add_heading(title, level=1)

    if doc_text:
        document.add_heading("Generated Document", level=2)
        for para in clean_markdown(doc_text).split("\n"):
            document.add_paragraph(para)

    if review_text:
        document.add_heading("LLM Review", level=2)
        _add_review_sections(document, review_text)

    # Write through the open handle instead of reopening the file by name,
    # which fails on platforms that forbid opening a file twice.
    with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file_name}") as tmpfile:
        document.save(tmpfile)
        saved_path = tmpfile.name
    logger.debug("DOCX saved to: %s", saved_path)
    return saved_path
| |
|
| | |
| | |
| | |
| |
|
def generate_document_interface(doc_type, party_a, party_b, context, country):
    """UI callback: draft a document and package it as a DOCX download.

    Returns:
        ``(text, path)`` where *text* is the generated document or an error
        message, and *path* is the DOCX file path (``None`` when *text*
        starts with "Error").
    """
    logger.info(f"User requested doc generation: {doc_type}, {country}")
    generated = generate_legal_document(doc_type, party_a, party_b, context, country)

    download_path = None
    if not generated.startswith("Error"):
        download_path = create_and_save_docx(
            generated,
            doc_type=doc_type,
            party_a=party_a,
            party_b=party_b,
        )
    return generated, download_path
| |
|
def review_document_interface(file_data, doc_type, party_a, party_b):
    """UI callback: extract text from an upload, review it, build a DOCX.

    Args:
        file_data: Uploaded file as delivered by the UI.
        doc_type: Document type, forwarded to the review and the report.
        party_a: First party name, forwarded to the review and the report.
        party_b: Second party name, forwarded to the review and the report.

    Returns:
        ``(text, path)`` where *text* is the review or a message explaining
        why no review was produced, and *path* is the DOCX report path
        (``None`` whenever no review was produced).
    """
    logger.info("User requested review.")
    if not file_data:
        return "No file uploaded.", None

    original_text = parse_uploaded_file_path(file_data)
    if original_text.startswith(("Error", "Unsupported")):
        return original_text, None

    # A file can parse "successfully" and still yield no text (for example a
    # scanned PDF). Reviewing an empty document would only produce invented
    # findings, so stop here.
    if not original_text.strip():
        logger.warning("Uploaded file contained no extractable text.")
        return (
            "No extractable text was found in the uploaded file. Scanned PDFs "
            "are not supported; please upload a .docx or a text-based PDF.",
            None,
        )

    review_text = review_legal_document(original_text, doc_type, party_a, party_b)
    if review_text.startswith("Error"):
        return review_text, None

    docx_file_path = create_and_save_docx(None, review_text, doc_type, party_a, party_b)
    return review_text, docx_file_path
| |
|
| | |
| | |
| | |
| | |
# Stylesheet passed to gr.Blocks in build_app. Within this file only the
# "tab-one" class is attached to a component (the "Review Document" tab);
# "tab-two" is defined but not referenced here.
custom_css = """
.tab-one {
background-color: #D1EEFC; /* Light blue */
color: #333;
}
.tab-two {
background-color: #FCEED1; /* Light orange */
color: #333;
}
/* If you want to style the tab label differently, you may need to target
specific child elements (like a .tab__header) within the class. */
"""
| |
|
def build_app():
    """Assemble the Gradio UI and return the Blocks object.

    The layout has two tabs: a hidden "Generate Document" tab and the
    initially selected "Review Document" tab. Generating a document also
    copies the document type and party names into the review tab's hidden
    fields.
    """
    doc_type_choices = ["MOU", "MSA", "SoW", "NDA"]
    country_choices = ["India", "Malaysia", "US", "UK", "Singapore", "Japan"]

    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown(
            """
            # UST Global Legal Document Analyzer (Hugging Face Version)

            **Review an Existing MOU, SOW, MSA in PDF/DOCX format**: Upload a document for analysis.

            **Disclaimer**: This tool provides assistance but is not a substitute for professional legal advice.
            """
        )

        with gr.Tabs(selected=1):
            # --- Generation tab (kept in the layout but hidden) ---
            with gr.Tab("Generate Document", visible=False):
                doc_type = gr.Dropdown(
                    label="Document Type", choices=doc_type_choices, value="MOU"
                )
                party_a = gr.Textbox(
                    label="Party A Name", placeholder="e.g., Tech Innovations LLC"
                )
                party_b = gr.Textbox(
                    label="Party B Name", placeholder="e.g., Global Consulting Corp"
                )
                context = gr.Textbox(
                    label="Context/Brief",
                    placeholder="Short summary of the agreement...",
                )
                country = gr.Dropdown(
                    label="Governing Law (Country)",
                    choices=country_choices,
                    value="India",
                )
                gen_button = gr.Button("Generate Document")
                gen_output_text = gr.Textbox(
                    label="Generated Document",
                    lines=15,
                    placeholder="Generated document will appear here...",
                )
                gen_output_file = gr.File(label="Download DOCX", type="filepath")
                gen_button.click(
                    generate_document_interface,
                    inputs=[doc_type, party_a, party_b, context, country],
                    outputs=[gen_output_text, gen_output_file],
                )

            # --- Review tab (selected on load) ---
            with gr.Tab("Review Document", elem_classes="tab-one", id=1):
                # Hidden mirrors of the generation inputs.
                doc_type_review = gr.Dropdown(
                    label="Document Type",
                    choices=doc_type_choices,
                    value="MOU",
                    visible=False,
                )
                party_a_review = gr.Textbox(label="Party A Name", visible=False)
                party_b_review = gr.Textbox(label="Party B Name", visible=False)

                file_input = gr.File(
                    label="Upload PDF/DOCX for Review", type="filepath"
                )
                review_button = gr.Button("Review Document")
                review_output_text = gr.Textbox(
                    label="Review",
                    lines=15,
                    placeholder="Review will appear here...",
                )
                review_output_file = gr.File(
                    label="Download Reviewed DOCX", type="filepath"
                )
                review_button.click(
                    review_document_interface,
                    inputs=[
                        file_input,
                        doc_type_review,
                        party_a_review,
                        party_b_review,
                    ],
                    outputs=[review_output_text, review_output_file],
                )

            # Second listener on the generate button: pass the three values
            # through unchanged into the review tab's hidden fields.
            gen_button.click(
                lambda kind, first, second: (kind, first, second),
                [doc_type, party_a, party_b],
                [doc_type_review, party_a_review, party_b_review],
            )

        gr.Markdown("**Note:** Scanned PDFs may not parse correctly. .docx is generally preferred.")
    return demo
| |
|
| | |
# Script entry point: build the UI and serve it. Nothing below runs when the
# module is imported.
if __name__ == "__main__":
    logger.info("Initializing Gradio interface...")
    demo = build_app()
    logger.info("Launching Gradio app.")
    # share=False: no public share link is requested.
    demo.launch(debug=True,share=False)