add

futurice · Dec 12, 2024 · e4a53e0 · e4a53e0
1 parent 4044337
commit e4a53e0
Show file tree

Hide file tree

Showing 16 changed files with 2,546 additions and 13 deletions.
diff --git a/.python-version b/.python-version
@@ -0,0 +1 @@
+3.12
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,15 @@
+[project]
+name = "edge-10x-prompt-flows"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "jinja2>=3.1.4",
+    "promptflow-azure>=1.16.2",
+    "promptflow>=1.16.2",
+    "promptflow-sdk>=0.0.1",
+    "promptflow-tools>=1.4.0",
+    "openai>=1.56.2",
+    "python-dotenv>=1.0.1",
+]
diff --git a/use_case_research_assistant/flows/bulk/make_google_search_queries.py b/use_case_research_assistant/flows/bulk/make_google_search_queries.py
@@ -0,0 +1,79 @@
+from typing import List, Dict
+import json
+from promptflow import tool
+
+@tool
+def make_google_search_queries(reformulated_questions: str, company_name: str, website_filters: List[str]) -> List[str]:
+    """
+    Takes reformulated questions as JSON string and creates multiple search queries with website filters
+    
+    Args:
+        reformulated_questions: JSON string containing reformulated questions from the LLM
+        company_name: Company name to focus the search on
+        website_filters: List of website domains to restrict search to
+    
+    Returns:
+        List of search queries with different variations and site filters
+    """
+    # Parse JSON string to get queries
+    try:
+        questions_dict = json.loads(reformulated_questions)
+        queries = questions_dict.get('queries', [])
+    except json.JSONDecodeError:
+        print(f"Warning: Could not parse JSON: {reformulated_questions}")
+        return []
+
+    # Create site-specific queries for each domain and query
+    site_queries = []
+    for domain in website_filters:
+        for base_query in queries:
+            # Skip empty or invalid queries
+            if not base_query or not isinstance(base_query, str):
+                continue
+
+            # Clean the query and ensure company name is included
+            base_query = base_query.strip()
+            if company_name.lower() not in base_query.lower():
+                base_query = f"{company_name} {base_query}"
+
+            # Basic site-restricted query
+            site_queries.append(f'site:{domain} {base_query}')
+
+            # Add variations with common search modifiers, always including company name
+            site_queries.extend([
+                f'site:{domain} {base_query} news',
+                f'site:{domain} {base_query} blog',
+                f'site:{domain} {base_query} press release',
+                f'site:{domain} {base_query} report',
+                f'site:{domain} {base_query} whitepaper',
+                f'site:{domain} {base_query} case study',
+                f'site:{domain} {base_query} webinar',
+                f'site:{domain} {base_query} podcast',
+                f'site:{domain} {base_query} video',
+                f'site:{domain} {base_query} infographic'
+            ])
+
+    # Add some non-site-restricted queries for broader context
+    general_queries = []
+    for base_query in queries:
+        if not base_query or not isinstance(base_query, str):
+            continue
+        base_query = base_query.strip()
+        if company_name.lower() not in base_query.lower():
+            base_query = f"{company_name} {base_query}"
+
+        general_queries.extend([
+            f'{base_query} announcement',
+            f'{base_query} blog',
+            f'{base_query} documentation'
+        ])
+
+    # Combine all queries and remove duplicates while preserving order
+    all_queries = []
+    seen = set()
+    for query in site_queries + general_queries:
+        if query not in seen:
+            all_queries.append(query)
+            seen.add(query)
+
+    return all_queries 
diff --git a/use_case_research_assistant/flows/evaluation/flow.dag.yaml b/use_case_research_assistant/flows/evaluation/flow.dag.yaml
@@ -12,7 +12,7 @@ inputs:
 outputs:
   output_answer:
     type: object
-    reference: ${search_each_question.output}
+    reference: ${search_question.output}
 nodes:
 - name: search_question
   type: python

diff --git a/use_case_research_assistant/flows/standard/QUESTION_EXPANDER.jinja2 b/use_case_research_assistant/flows/standard/QUESTION_EXPANDER.jinja2
@@ -1,6 +1,6 @@
 system:
 You are a research assistant. Your task is to rephrase the given question into a more specific question.
-
+It's important to return the response as a JSON object with a "queries" array containing the search queries.
 user:
 Question: {{question}}
 Sub-questions:
diff --git a/use_case_research_assistant/flows/standard/QUESTION_REFORMULATION_1.jinja2 b/use_case_research_assistant/flows/standard/QUESTION_REFORMULATION_1.jinja2
@@ -0,0 +1,42 @@
+system:
+You are an expert query rewriter and reformulator. You will be given a question to reformulate.
+The reformulated queries will be used to research the internet. The objective is to produce the best possible search queries.
+
+Do not use company name placeholders when reformulating the question. Just reformulate the question in a logical, SEO-friendly way.
+
+Here is how you will think about the question:
+1. Understand the question thoroughly
+2. Identify key concepts and queries in the question
+3. Extract the main concepts and related concepts from the question
+4. Break down the original question into several search queries
+5. Reformulate each search query into a more specific and targeted search query
+
+Here are some good examples that we want to follow. Please study them and make similar reformulations.
+
+    #Original query
+    Has the company Vattenfall adopted reference architectures or best practices to guide its cloud deployments?
+    #reformulated queries
+    - Vattenfall cloud deployment reference architectures
+    - Vattenfall best practices cloud deployment
+    - Has Vattenfall adopted cloud deployment standards
+
+    #Original query
+    How well is the company Vattenfall managing risks associated with cloud migration (e.g., downtime, data loss)?
+    #reformulated queries
+    - Vattenfall cloud migration risk management strategies
+    - Challenges of cloud migration for Vattenfall
+    - Vattenfall case study on cloud migration success and failures
+
+    #Original query
+    How well are external market or competitive factors being integrated into cloud discussions of Vattenfall?
+    #reformulated queries
+    - Integration of external market factors in cloud discussions
+    - Competitive analysis in cloud computing
+    - Incorporating market trends in cloud strategy
+
+It's important to return the response as a JSON object with a "queries" array containing the search queries.
+
+user:
+Reformulate the following query:
+<query>: {{question}} </query>
+Formulated_queries:
diff --git a/use_case_research_assistant/flows/standard/QUESTION_REFORMULATION_2.jinja2 b/use_case_research_assistant/flows/standard/QUESTION_REFORMULATION_2.jinja2
@@ -0,0 +1,17 @@
+system: You are a research assistant helping to reformulate questions to get specific and focused search results about {{company_name}} products and services.
+
+user: Please reformulate this question into multiple specific search queries about {{company_name}}: {{question}}
+Focus on specific product names, features, and technical details.
+
+Important: Return your response as a JSON object with a "queries" array containing the search queries.
+
+assistant: {
+  "queries": [
+    "{{company_name}} {{question}} product names specifications",
+    "{{company_name}} {{question}} latest features updates",
+    "{{company_name}} {{question}} technical capabilities",
+    "{{company_name}} {{question}} pricing tiers comparison",
+    "{{company_name}} {{question}} system requirements",
+    "{{company_name}} {{question}} integration options"
+  ]
+}
diff --git a/use_case_research_assistant/flows/standard/QUESTION_REFORMULATION_3.jinja2 b/use_case_research_assistant/flows/standard/QUESTION_REFORMULATION_3.jinja2
@@ -0,0 +1,17 @@
+system: You are a comprehensive research assistant helping to explore all aspects of {{company_name}} products and services.
+
+user: Please reformulate this question into multiple comprehensive search queries about {{company_name}}: {{question}}
+Consider multiple angles including business impact, use cases, and industry applications.
+
+Important: Return your response as a JSON object with a "queries" array containing the search queries.
+
+assistant: {
+  "queries": [
+    "{{company_name}} {{question}} enterprise solutions overview",
+    "{{company_name}} {{question}} customer success stories",
+    "{{company_name}} {{question}} industry use cases",
+    "{{company_name}} {{question}} business benefits ROI",
+    "{{company_name}} {{question}} market analysis comparison",
+    "{{company_name}} {{question}} future roadmap plans"
+  ]
+}
diff --git a/use_case_research_assistant/flows/standard/QUESTION_REFORMULATION_4.jinja2 b/use_case_research_assistant/flows/standard/QUESTION_REFORMULATION_4.jinja2
@@ -0,0 +1,17 @@
+system: You are a technical research assistant focusing on detailed technical aspects of {{company_name}} products and services.
+
+user: Please reformulate this question into multiple technical search queries about {{company_name}}: {{question}}
+Focus on technical specifications, documentation, and implementation details.
+
+Important: Return your response as a JSON object with a "queries" array containing the search queries.
+
+assistant: {
+  "queries": [
+    "{{company_name}} {{question}} technical documentation guide",
+    "{{company_name}} {{question}} API reference implementation",
+    "{{company_name}} {{question}} architecture overview",
+    "{{company_name}} {{question}} deployment configuration",
+    "{{company_name}} {{question}} best practices guidelines",
+    "{{company_name}} {{question}} security compliance requirements"
+  ]
+}
diff --git a/use_case_research_assistant/flows/standard/flow.dag.yaml b/use_case_research_assistant/flows/standard/flow.dag.yaml
@@ -2,31 +2,62 @@ $schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json
 inputs:
   question:
     type: string
-    default: What's the population of Finland?
-outputs:
-  answer:
+    default: "What are some generative ai products?"
+  company_name:
     type: string
-    reference: ${search_question.output}
+    default: "Microsoft"
+  website_filters:
+    type: list
+    default: ["www.microsoft.com", "www.learn.microsoft.com"]
+outputs:
+  answers:
+    type: list
+    reference: ${make_google_search_queries.output}
 nodes:
-- name: QUESTION_EXPANDER
+- name: QUESTION_REFORMULATION
   type: llm
   source:
     type: code
-    path: QUESTION_EXPANDER.jinja2
+    path: QUESTION_REFORMULATION_1.jinja2
   inputs:
-    deployment_name: gpt-35-turbo
-    max_tokens: 64
+    deployment_name: gpt-4o
+    max_tokens: 1000
     question: ${inputs.question}
+    company_name: ${inputs.company_name}
+    response_format: {"type": "json_object"}
   provider: AzureOpenAI
   connection: aoai
   api: chat
   module: promptflow.tools.aoai
-- name: search_question
+  variants:
+    variant_0:
+      source:
+        type: code
+        path: QUESTION_REFORMULATION_1.jinja2
+    variant_1:
+      source:
+        type: code
+        path: QUESTION_REFORMULATION_2.jinja2
+    variant_2:
+      source:
+        type: code 
+        path: QUESTION_REFORMULATION_3.jinja2
+      inputs:
+        temperature: 0.7
+    variant_3:
+      source:
+        type: code
+        path: QUESTION_REFORMULATION_4.jinja2
+      inputs:
+        temperature: 0.3
+- name: make_google_search_queries
   type: python
   source:
     type: code
-    path: search_question.py
+    path: make_google_search_queries.py
   inputs:
-    question: ${QUESTION_EXPANDER.output}
+    reformulated_questions: ${QUESTION_REFORMULATION.output}
+    company_name: ${inputs.company_name}
+    website_filters: ${inputs.website_filters}
 environment:
   python_requirements_txt: requirements.txt
diff --git a/use_case_research_assistant/flows/standard/make_google_search_queries.py b/use_case_research_assistant/flows/standard/make_google_search_queries.py
@@ -0,0 +1,79 @@
+from typing import List, Dict
+import json
+from promptflow import tool
+
+@tool
+def make_google_search_queries(reformulated_questions: str, company_name: str, website_filters: List[str]) -> List[str]:
+    """
+    Takes reformulated questions as JSON string and creates multiple search queries with website filters
+    
+    Args:
+        reformulated_questions: JSON string containing reformulated questions from the LLM
+        company_name: Company name to focus the search on
+        website_filters: List of website domains to restrict search to
+    
+    Returns:
+        List of search queries with different variations and site filters
+    """
+    # Parse JSON string to get queries
+    try:
+        questions_dict = json.loads(reformulated_questions)
+        queries = questions_dict.get('queries', [])
+    except json.JSONDecodeError:
+        print(f"Warning: Could not parse JSON: {reformulated_questions}")
+        return []
+
+    # Create site-specific queries for each domain and query
+    site_queries = []
+    for domain in website_filters:
+        for base_query in queries:
+            # Skip empty or invalid queries
+            if not base_query or not isinstance(base_query, str):
+                continue
+
+            # Clean the query and ensure company name is included
+            base_query = base_query.strip()
+            if company_name.lower() not in base_query.lower():
+                base_query = f' "{company_name}" {base_query}'
+
+            # Basic site-restricted query
+            site_queries.append(f'site:{domain} {base_query}')
+
+            # Add variations with common search modifiers, always including company name
+            site_queries.extend([
+                f'site:{domain} {base_query} news',
+                f'site:{domain} {base_query} blog',
+                f'site:{domain} {base_query} press release',
+                f'site:{domain} {base_query} report',
+                f'site:{domain} {base_query} whitepaper',
+                f'site:{domain} {base_query} case study',
+                f'site:{domain} {base_query} webinar',
+                f'site:{domain} {base_query} podcast',
+                f'site:{domain} {base_query} video',
+                f'site:{domain} {base_query} infographic'
+            ])
+
+    # Add some non-site-restricted queries for broader context
+    general_queries = []
+    for base_query in queries:
+        if not base_query or not isinstance(base_query, str):
+            continue
+        base_query = base_query.strip()
+        if company_name.lower() not in base_query.lower():
+            base_query = f'"{company_name}" {base_query}'
+
+        general_queries.extend([
+            f'{base_query} announcement',
+            f'{base_query} blog',
+            f'{base_query} documentation'
+        ])
+
+    # Combine all queries and remove duplicates while preserving order
+    all_queries = []
+    seen = set()
+    for query in site_queries + general_queries:
+        if query not in seen:
+            all_queries.append(query)
+            seen.add(query)
+
+    return all_queries 
diff --git a/use_case_research_assistant/flows/standard_old/QUESTION_EXPANDER.jinja2 b/use_case_research_assistant/flows/standard_old/QUESTION_EXPANDER.jinja2
@@ -0,0 +1,6 @@
+system:
+You are a research assistant. Your task is to rephrase the given question into a more specific question.
+
+user:
+Question: {{question}}
+Sub-questions: