diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js index c6431d733..1292b850c 100644 --- a/collector/processLink/convert/generic.js +++ b/collector/processLink/convert/generic.js @@ -29,7 +29,7 @@ async function scrapeGenericUrl(link) { docAuthor: "no author found", description: "No description found.", docSource: "URL link uploaded by the user.", - chunkSource: slugify(link) + ".html", + chunkSource: `link://${link}`, published: new Date().toLocaleString(), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asAudio.js b/collector/processSingleFile/convert/asAudio.js index 7688d7b85..15ae5cf00 100644 --- a/collector/processSingleFile/convert/asAudio.js +++ b/collector/processSingleFile/convert/asAudio.js @@ -58,7 +58,7 @@ async function asAudio({ fullFilePath = "", filename = "" }) { docAuthor: "no author found", description: "No description found.", docSource: "pdf file uploaded by the user.", - chunkSource: filename, + chunkSource: "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asDocx.js b/collector/processSingleFile/convert/asDocx.js index b4fe7d2c9..48649c6f0 100644 --- a/collector/processSingleFile/convert/asDocx.js +++ b/collector/processSingleFile/convert/asDocx.js @@ -39,7 +39,7 @@ async function asDocX({ fullFilePath = "", filename = "" }) { docAuthor: "no author found", description: "No description found.", docSource: "pdf file uploaded by the user.", - chunkSource: filename, + chunkSource: "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asMbox.js b/collector/processSingleFile/convert/asMbox.js index f62f6b2ba..4adde23ec 100644 --- a/collector/processSingleFile/convert/asMbox.js +++ b/collector/processSingleFile/convert/asMbox.js @@ -49,7 +49,7 @@ async function asMbox({ fullFilePath = "", filename = "" }) { docAuthor: mail?.from?.text, description: "No description found.", docSource: "Mbox message file uploaded by the user.", - chunkSource: filename, + chunkSource: "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asOfficeMime.js b/collector/processSingleFile/convert/asOfficeMime.js index 45b316610..b6c3c0601 100644 --- a/collector/processSingleFile/convert/asOfficeMime.js +++ b/collector/processSingleFile/convert/asOfficeMime.js @@ -34,7 +34,7 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) { docAuthor: "no author found", description: "No description found.", docSource: "Office file uploaded by the user.", - chunkSource: filename, + chunkSource: "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asPDF.js b/collector/processSingleFile/convert/asPDF.js index b89b97411..560c4939f 100644 --- a/collector/processSingleFile/convert/asPDF.js +++ b/collector/processSingleFile/convert/asPDF.js @@ -44,7 +44,7 @@ async function asPDF({ fullFilePath = "", filename = "" }) { docAuthor: docs[0]?.metadata?.pdf?.info?.Creator || "no author found", description: "No description found.", docSource: "pdf file uploaded by the user.", - chunkSource: filename, + chunkSource: "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/processSingleFile/convert/asTxt.js b/collector/processSingleFile/convert/asTxt.js index cf7260d4b..53987f247 100644 --- a/collector/processSingleFile/convert/asTxt.js +++ b/collector/processSingleFile/convert/asTxt.js @@ -34,7 +34,7 @@ async function asTxt({ fullFilePath = "", filename = "" }) { docAuthor: "Unknown", // TODO: Find a better author description: "Unknown", // TODO: Find a better description docSource: "a text file uploaded by the user.", - chunkSource: filename, + chunkSource: "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, diff --git a/collector/utils/extensions/GithubRepo/index.js b/collector/utils/extensions/GithubRepo/index.js index d459e6357..e5925f1d4 100644 --- a/collector/utils/extensions/GithubRepo/index.js +++ b/collector/utils/extensions/GithubRepo/index.js @@ -45,8 +45,8 @@ async function loadGithubRepo(args) { title: doc.metadata.source, docAuthor: repo.author, description: "No description found.", - docSource: repo.repo, - chunkSource: doc.metadata.source, + docSource: doc.metadata.source, + chunkSource: `link://${doc.metadata.repository}/blob/${doc.metadata.branch}/${doc.metadata.source}`, published: new Date().toLocaleString(), wordCount: doc.pageContent.split(" ").length, pageContent: doc.pageContent, diff --git a/collector/utils/extensions/YoutubeTranscript/index.js b/collector/utils/extensions/YoutubeTranscript/index.js index 22540dbb0..c0e198069 100644 --- a/collector/utils/extensions/YoutubeTranscript/index.js +++ b/collector/utils/extensions/YoutubeTranscript/index.js @@ -68,7 +68,7 @@ async function loadYouTubeTranscript({ url }) { docAuthor: metadata.author, description: metadata.description, docSource: url, - chunkSource: url, + chunkSource: `link://${url}`, published: new Date().toLocaleString(), wordCount: content.split(" ").length, pageContent: content, diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx index 697c55214..85b7fb7bb 100644 --- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx +++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx @@ -1,22 +1,31 @@ import { memo, useState } from "react"; -import { X } from "@phosphor-icons/react"; import { v4 } from "uuid"; import { decode as HTMLDecode } from "he"; import { CaretRight, FileText } from "@phosphor-icons/react"; import truncate from "truncate"; import ModalWrapper from "@/components/ModalWrapper"; +import { middleTruncate } from "@/utils/directories"; +import { + ArrowSquareOut, + GithubLogo, + Link, + X, + YoutubeLogo, +} from "@phosphor-icons/react"; function combineLikeSources(sources) { const combined = {}; sources.forEach((source) => { - const { id, title, text } = source; + const { id, title, text, chunkSource = "" } = source; if (combined.hasOwnProperty(title)) { combined[title].text += `\n\n ---- Chunk ${id || ""} ---- \n\n${text}`; combined[title].references += 1; + combined[title].chunkSource = chunkSource; } else { - combined[title] = { title, text, references: 1 }; + combined[title] = { title, text, chunkSource, references: 1 }; } }); + return Object.values(combined); } @@ -41,10 +50,10 @@ export default function Citations({ sources = [] }) { /> {open && ( -
+
{combineLikeSources(sources).map((source) => ( setSelectedSource(source)} /> @@ -64,16 +73,18 @@ export default function Citations({ sources = [] }) { const Citation = memo(({ source, onClick }) => { const { title } = source; if (!title) return null; - - const truncatedTitle = truncateMiddle(title); + const chunkSourceInfo = parseChunkSource(source); + const truncatedTitle = chunkSourceInfo?.text ?? middleTruncate(title, 25); + const CitationIcon = ICONS.hasOwnProperty(chunkSourceInfo?.icon) + ? ICONS[chunkSourceInfo.icon] + : ICONS.file; return (
- +

{truncatedTitle}

); @@ -99,14 +110,31 @@ function SkeletonLine() { function CitationDetailModal({ source, onClose }) { const { references, title, text } = source; + const { isUrl, text: webpageUrl, href: linkTo } = parseChunkSource(source); return (
-

- {truncate(title, 45)} -

+
+ {isUrl ? ( + +

+ {webpageUrl} + +

+
+ ) : ( +

+ {truncate(title, 45)} +

+ )} +
{references > 1 && (

Referenced {references} times. @@ -141,11 +169,47 @@ function CitationDetailModal({ source, onClose }) { ); } -function truncateMiddle(title) { - if (title.length <= 18) return title; +const ICONS = { + file: FileText, + link: Link, + youtube: YoutubeLogo, + github: GithubLogo, +}; - const startStr = title.substr(0, 9); - const endStr = title.substr(-9); +// Show the correct title and/or display text for citations +// which contain valid outbound links that can be clicked by the +// user when viewing a citation. Optionally allows various icons +// to show distinct types of sources. +function parseChunkSource({ title = "", chunkSource = "" }) { + const nullResponse = { + isUrl: false, + text: null, + href: null, + icon: "file", + }; - return `${startStr}...${endStr}`; + if (!chunkSource.startsWith("link://")) return nullResponse; + try { + const url = new URL(chunkSource.split("link://")[1]); + let text = url.host + url.pathname; + let icon = "link"; + + if (url.host.includes("youtube.com")) { + text = title; + icon = "youtube"; + } + + if (url.host.includes("github.com")) { + text = title; + icon = "github"; + } + + return { + isUrl: true, + href: url.toString(), + text, + icon, + }; + } catch {} + return nullResponse; }