689 links in citation (#715)

* Include links in citations
force ChunkSource key to retain this information
old links will be unsupported

* show special icons depending on source

* remove console log

* reset server documents writeTo
This commit is contained in:
Timothy Carambat 2024-02-13 14:11:57 -08:00 committed by GitHub
parent f4b09a8c79
commit d52f8aafd4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 92 additions and 28 deletions

View File

@ -29,7 +29,7 @@ async function scrapeGenericUrl(link) {
docAuthor: "no author found", docAuthor: "no author found",
description: "No description found.", description: "No description found.",
docSource: "URL link uploaded by the user.", docSource: "URL link uploaded by the user.",
chunkSource: slugify(link) + ".html", chunkSource: `link://${link}`,
published: new Date().toLocaleString(), published: new Date().toLocaleString(),
wordCount: content.split(" ").length, wordCount: content.split(" ").length,
pageContent: content, pageContent: content,

View File

@ -58,7 +58,7 @@ async function asAudio({ fullFilePath = "", filename = "" }) {
docAuthor: "no author found", docAuthor: "no author found",
description: "No description found.", description: "No description found.",
docSource: "pdf file uploaded by the user.", docSource: "pdf file uploaded by the user.",
chunkSource: filename, chunkSource: "",
published: createdDate(fullFilePath), published: createdDate(fullFilePath),
wordCount: content.split(" ").length, wordCount: content.split(" ").length,
pageContent: content, pageContent: content,

View File

@ -39,7 +39,7 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
docAuthor: "no author found", docAuthor: "no author found",
description: "No description found.", description: "No description found.",
docSource: "pdf file uploaded by the user.", docSource: "pdf file uploaded by the user.",
chunkSource: filename, chunkSource: "",
published: createdDate(fullFilePath), published: createdDate(fullFilePath),
wordCount: content.split(" ").length, wordCount: content.split(" ").length,
pageContent: content, pageContent: content,

View File

@ -49,7 +49,7 @@ async function asMbox({ fullFilePath = "", filename = "" }) {
docAuthor: mail?.from?.text, docAuthor: mail?.from?.text,
description: "No description found.", description: "No description found.",
docSource: "Mbox message file uploaded by the user.", docSource: "Mbox message file uploaded by the user.",
chunkSource: filename, chunkSource: "",
published: createdDate(fullFilePath), published: createdDate(fullFilePath),
wordCount: content.split(" ").length, wordCount: content.split(" ").length,
pageContent: content, pageContent: content,

View File

@ -34,7 +34,7 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) {
docAuthor: "no author found", docAuthor: "no author found",
description: "No description found.", description: "No description found.",
docSource: "Office file uploaded by the user.", docSource: "Office file uploaded by the user.",
chunkSource: filename, chunkSource: "",
published: createdDate(fullFilePath), published: createdDate(fullFilePath),
wordCount: content.split(" ").length, wordCount: content.split(" ").length,
pageContent: content, pageContent: content,

View File

@ -44,7 +44,7 @@ async function asPDF({ fullFilePath = "", filename = "" }) {
docAuthor: docs[0]?.metadata?.pdf?.info?.Creator || "no author found", docAuthor: docs[0]?.metadata?.pdf?.info?.Creator || "no author found",
description: "No description found.", description: "No description found.",
docSource: "pdf file uploaded by the user.", docSource: "pdf file uploaded by the user.",
chunkSource: filename, chunkSource: "",
published: createdDate(fullFilePath), published: createdDate(fullFilePath),
wordCount: content.split(" ").length, wordCount: content.split(" ").length,
pageContent: content, pageContent: content,

View File

@ -34,7 +34,7 @@ async function asTxt({ fullFilePath = "", filename = "" }) {
docAuthor: "Unknown", // TODO: Find a better author docAuthor: "Unknown", // TODO: Find a better author
description: "Unknown", // TODO: Find a better description description: "Unknown", // TODO: Find a better description
docSource: "a text file uploaded by the user.", docSource: "a text file uploaded by the user.",
chunkSource: filename, chunkSource: "",
published: createdDate(fullFilePath), published: createdDate(fullFilePath),
wordCount: content.split(" ").length, wordCount: content.split(" ").length,
pageContent: content, pageContent: content,

View File

@ -45,8 +45,8 @@ async function loadGithubRepo(args) {
title: doc.metadata.source, title: doc.metadata.source,
docAuthor: repo.author, docAuthor: repo.author,
description: "No description found.", description: "No description found.",
docSource: repo.repo, docSource: doc.metadata.source,
chunkSource: doc.metadata.source, chunkSource: `link://${doc.metadata.repository}/blob/${doc.metadata.branch}/${doc.metadata.source}`,
published: new Date().toLocaleString(), published: new Date().toLocaleString(),
wordCount: doc.pageContent.split(" ").length, wordCount: doc.pageContent.split(" ").length,
pageContent: doc.pageContent, pageContent: doc.pageContent,

View File

@ -68,7 +68,7 @@ async function loadYouTubeTranscript({ url }) {
docAuthor: metadata.author, docAuthor: metadata.author,
description: metadata.description, description: metadata.description,
docSource: url, docSource: url,
chunkSource: url, chunkSource: `link://${url}`,
published: new Date().toLocaleString(), published: new Date().toLocaleString(),
wordCount: content.split(" ").length, wordCount: content.split(" ").length,
pageContent: content, pageContent: content,

View File

@ -1,22 +1,31 @@
import { memo, useState } from "react"; import { memo, useState } from "react";
import { X } from "@phosphor-icons/react";
import { v4 } from "uuid"; import { v4 } from "uuid";
import { decode as HTMLDecode } from "he"; import { decode as HTMLDecode } from "he";
import { CaretRight, FileText } from "@phosphor-icons/react"; import { CaretRight, FileText } from "@phosphor-icons/react";
import truncate from "truncate"; import truncate from "truncate";
import ModalWrapper from "@/components/ModalWrapper"; import ModalWrapper from "@/components/ModalWrapper";
import { middleTruncate } from "@/utils/directories";
import {
ArrowSquareOut,
GithubLogo,
Link,
X,
YoutubeLogo,
} from "@phosphor-icons/react";
/**
 * Merge citation chunks that share the same document title into one entry.
 *
 * Each merged entry keeps the title, the concatenated chunk text (later
 * chunks are appended under a "---- Chunk <id> ----" separator), the number
 * of chunks that referenced the document, and the `chunkSource` used to
 * build the outbound link/icon for the citation.
 *
 * @param {Array<{id?: string|number, title: string, text: string, chunkSource?: string}>} sources
 *   Raw citation chunks.
 * @returns {Array<{title: string, text: string, chunkSource: string, references: number}>}
 *   One entry per distinct title, in order of first appearance.
 */
function combineLikeSources(sources) {
  // A Map (not a plain object) is used because titles are arbitrary document
  // names: keys such as "hasOwnProperty" or "__proto__" would otherwise
  // shadow Object.prototype members and either throw or silently drop the
  // source. A Map also preserves first-seen order for every title.
  const combined = new Map();

  sources.forEach((source) => {
    const { id, title, text, chunkSource = "" } = source;
    const existing = combined.get(title);

    if (!existing) {
      combined.set(title, { title, text, chunkSource, references: 1 });
      return;
    }

    existing.text += `\n\n ---- Chunk ${id || ""} ---- \n\n${text}`;
    existing.references += 1;
    // Only replace the stored link when this chunk actually carries one, so
    // a chunk without chunkSource cannot erase a valid link.
    if (chunkSource) existing.chunkSource = chunkSource;
  });

  return [...combined.values()];
}
@ -41,10 +50,10 @@ export default function Citations({ sources = [] }) {
/> />
</button> </button>
{open && ( {open && (
<div className="flex flex-wrap md:flex-row flex-col items-center gap-4 overflow-x-scroll mt-1 doc__source"> <div className="flex flex-wrap md:flex-row flex-col md:items-center gap-4 overflow-x-scroll mt-1 doc__source">
{combineLikeSources(sources).map((source) => ( {combineLikeSources(sources).map((source) => (
<Citation <Citation
key={source?.id || v4()} key={v4()}
source={source} source={source}
onClick={() => setSelectedSource(source)} onClick={() => setSelectedSource(source)}
/> />
@ -64,16 +73,18 @@ export default function Citations({ sources = [] }) {
const Citation = memo(({ source, onClick }) => { const Citation = memo(({ source, onClick }) => {
const { title } = source; const { title } = source;
if (!title) return null; if (!title) return null;
const chunkSourceInfo = parseChunkSource(source);
const truncatedTitle = truncateMiddle(title); const truncatedTitle = chunkSourceInfo?.text ?? middleTruncate(title, 25);
const CitationIcon = ICONS.hasOwnProperty(chunkSourceInfo?.icon)
? ICONS[chunkSourceInfo.icon]
: ICONS.file;
return ( return (
<div <div
className="flex flex-row justify-center items-center cursor-pointer text-sky-400" className="w-fit flex flex-row justify-center items-center cursor-pointer text-sky-400"
style={{ width: "24%" }}
onClick={onClick} onClick={onClick}
> >
<FileText className="w-6 h-6" weight="bold" /> <CitationIcon className="w-6 h-6" weight="bold" />
<p className="text-sm font-medium whitespace-nowrap">{truncatedTitle}</p> <p className="text-sm font-medium whitespace-nowrap">{truncatedTitle}</p>
</div> </div>
); );
@ -99,14 +110,31 @@ function SkeletonLine() {
function CitationDetailModal({ source, onClose }) { function CitationDetailModal({ source, onClose }) {
const { references, title, text } = source; const { references, title, text } = source;
const { isUrl, text: webpageUrl, href: linkTo } = parseChunkSource(source);
return ( return (
<ModalWrapper isOpen={source}> <ModalWrapper isOpen={source}>
<div className="w-full max-w-2xl bg-main-gradient rounded-lg shadow border border-white/10 overflow-hidden"> <div className="w-full max-w-2xl bg-main-gradient rounded-lg shadow border border-white/10 overflow-hidden">
<div className="relative p-6 border-b rounded-t border-gray-500/50"> <div className="relative p-6 border-b rounded-t border-gray-500/50">
<div className="w-full flex gap-x-2 items-center">
{isUrl ? (
<a
href={linkTo}
target="_blank"
rel="noreferrer"
className="text-xl font-semibold text-white overflow-hidden overflow-ellipsis whitespace-nowrap hover:underline hover:text-blue-300 flex items-center gap-x-1"
>
<h3 className="flex items-center gap-x-1">
{webpageUrl}
<ArrowSquareOut />
</h3>
</a>
) : (
<h3 className="text-xl font-semibold text-white overflow-hidden overflow-ellipsis whitespace-nowrap"> <h3 className="text-xl font-semibold text-white overflow-hidden overflow-ellipsis whitespace-nowrap">
{truncate(title, 45)} {truncate(title, 45)}
</h3> </h3>
)}
</div>
{references > 1 && ( {references > 1 && (
<p className="text-xs text-gray-400 mt-2"> <p className="text-xs text-gray-400 mt-2">
Referenced {references} times. Referenced {references} times.
@ -141,11 +169,47 @@ function CitationDetailModal({ source, onClose }) {
); );
} }
function truncateMiddle(title) { const ICONS = {
if (title.length <= 18) return title; file: FileText,
link: Link,
youtube: YoutubeLogo,
github: GithubLogo,
};
const startStr = title.substr(0, 9); // Show the correct title and/or display text for citations
const endStr = title.substr(-9); // which contain valid outbound links that can be clicked by the
// user when viewing a citation. Optionally allows various icons
// to show distinct types of sources.
/**
 * Derive link/display information for a citation from its `chunkSource`.
 *
 * A `chunkSource` of the form `link://<absolute http(s) URL>` marks the
 * citation as an outbound link. Anything else (empty string, legacy values,
 * unparsable URLs, non-web schemes) yields the "plain file" response.
 *
 * @param {{title?: string, chunkSource?: string}} source
 * @returns {{isUrl: boolean, text: string|null, href: string|null, icon: string}}
 *   `icon` is one of "file", "link", "youtube" or "github".
 */
function parseChunkSource({ title = "", chunkSource = "" }) {
  const LINK_PREFIX = "link://";
  const nullResponse = {
    isUrl: false,
    text: null,
    href: null,
    icon: "file",
  };

  // Guard against null/non-string values; the default above only covers undefined.
  if (typeof chunkSource !== "string" || !chunkSource.startsWith(LINK_PREFIX))
    return nullResponse;

  let url;
  try {
    // Strip only the leading prefix. Splitting on "link://" would truncate
    // URLs that contain that sequence again (e.g. inside a query string).
    url = new URL(chunkSource.slice(LINK_PREFIX.length));
  } catch {
    // Not a parsable absolute URL: deliberately fall back to a plain file citation.
    return nullResponse;
  }

  // The href ends up in an <a> tag, so only web links are allowed through
  // (this rejects e.g. `javascript:` URLs).
  if (url.protocol !== "http:" && url.protocol !== "https:") return nullResponse;

  // Match the domain itself or a subdomain of it, rather than any host that
  // merely contains the string (e.g. "notyoutube.com").
  const hostname = url.hostname.toLowerCase();
  const isOnDomain = (domain) =>
    hostname === domain || hostname.endsWith(`.${domain}`);

  const defaultText = url.host + url.pathname;
  let text = defaultText;
  let icon = "link";

  if (isOnDomain("youtube.com") || isOnDomain("youtu.be")) {
    // Video URLs are opaque ids, so the title reads better when present.
    text = title || defaultText;
    icon = "youtube";
  } else if (isOnDomain("github.com")) {
    text = title || defaultText;
    icon = "github";
  }

  return {
    isUrl: true,
    href: url.toString(),
    text,
    icon,
  };
}