ChatUtils.sys.mjs - mozsearch

firefox-main/browser/components/aiwindow/models/ChatUtils.sys.mjs (file symbol)

Enable keyboard shortcuts

Source code

File a bug in Core :: Machine Learning: General

Revision control

Copy as Markdown

Other Tools

HG Web

/**

 * This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/.

*/

// Important! Changing or removing this value requires a security review.
//
// Limit the page titles to 100 characters to relax the use of the untrusted content flag
// from page metadata. This number was specifically chosen as it fit 95% of all page titles
// in the places database for a single places database used as an example.
const MAX_METADATA_LENGTH = 100;

/**
 * Truncates and spotlights untrusted metadata text to guard against prompt injection by adding an
 * (Untrusted webpage data) tag.
 *
 * Important! Changing this function requires a security review.
 *
 * Metadata such as page titles and page descriptions are untrusted content from the web and
 * could contain prompt injections to try and change the behavior of language model
 * conversations. Typically untrusted content gets flagged in a conversation, and
 * subsequent tool calls can be restricted if they have access to private information as
 * well.
 *
 * By truncating the length of this text, we limit (but do not remove) the ability for these
 * pieces of text to be used as prompt injections. In this case we have chosen to relax
 * the security flags to NOT mark these as untrusted when the text is truncated.
 * This is useful since page titles are used very frequently in chat conversations.
 *
 * In addition, spotlighting this text helps the model to identify webpage data is untrusted.
 * We note that the spotlighting tokens added are only a part of the delimiting. Prompts
 * have also been updated to include instructions about how to treat untrusted data.
 *
 * @param {string} text - Untrusted metadata text, e.g. a page title.
 * @param {boolean} truncateOnly - When true, only truncate; skip escaping and
 *   the spotlighting wrapper.
 * @returns {string} The sanitized text, or an empty string when `text` is falsy.
 */
export function sanitizeUntrustedContent(text, truncateOnly = false) {
  if (!text) {
    return "";
  }

  let fixedText = text;

  // Truncate over-long text and mark the cut with an ellipsis (U+2026).
  // NOTE(review): slice() counts UTF-16 code units, so a cut at the limit can
  // split a surrogate pair and leave a lone surrogate before the ellipsis.
  // Left unchanged here because this function is security-review gated.
  if (text.length > MAX_METADATA_LENGTH) {
    fixedText = fixedText.slice(0, MAX_METADATA_LENGTH) + "\u2026";
  }

  if (truncateOnly) {
    return fixedText;
  }

  // Light smoothing. Backslashes are escaped first so the backslashes added
  // while escaping double quotes are not escaped a second time; whitespace
  // runs (including newlines) are then collapsed to a single space.
  fixedText = fixedText
    .replace(/\\/g, "\\\\")
    .replace(/"/g, '\\"')
    .replace(/\s+/g, " ");

  // Spotlighting: wrap in quotes and append the untrusted-data tag.
  return `"${fixedText}" (Untrusted webpage data)`;
}

// Holder object for module exports that are reached as `lazy.<name>` in this
// file. ChromeUtils.defineESModuleGetters installs one getter per key below,
// each mapped to the module URL that provides the export of the same name.
const lazy = {};

ChromeUtils.defineESModuleGetters(lazy, {

  MemoriesManager:

    "moz-src:///browser/components/aiwindow/models/memories/MemoriesManager.sys.mjs",

  renderPrompt: "moz-src:///browser/components/aiwindow/models/Utils.sys.mjs",

  MODEL_FEATURES: "moz-src:///browser/components/aiwindow/models/Utils.sys.mjs",

});

/**
 * Get a local wall-clock timestamp formatted as `YYYY-MM-DDTHH:mm:ss`.
 *
 * The string is assembled from the local-time getters of `Date`, so it is an
 * ISO-8601-style local time and carries no timezone offset or `Z` suffix.
 *
 * @param {Date} [date] - Moment to format. Defaults to the current time.
 * @returns {string|null} The formatted timestamp, or null when `date` is not
 *   a valid Date or formatting fails.
 */
export function getLocalIsoTime(date = new Date()) {
  try {
    // An invalid Date would otherwise format as "NaN-NaN-NaNTNaN:NaN:NaN".
    if (Number.isNaN(date.getTime())) {
      return null;
    }

    const pad = n => String(n).padStart(2, "0");
    // getMonth() is zero-based, hence the + 1.
    const datePart = `${date.getFullYear()}-${pad(date.getMonth() + 1)}-${pad(date.getDate())}`;
    const timePart = `${pad(date.getHours())}:${pad(date.getMinutes())}:${pad(date.getSeconds())}`;

    return `${datePart}T${timePart}`;
  } catch {
    // Best effort: callers treat null as "timestamp unavailable".
    return null;
  }
}

/**
 * Get current tab metadata: url, title, description if available.
 *
 * The current tab is the first entry of `contextMentions` whose `type` is
 * "currentTab". Its `label` is used as the title and passed through
 * sanitizeUntrustedContent because it is untrusted web content. When no such
 * entry exists, every field is an empty string.
 *
 * @param {Array<ContextWebsite>} contextMentions
 * @returns {Promise<{url: string, title: string, description: string}>}
 */
export async function getCurrentTabMetadata(contextMentions = []) {
  // Optional chaining tolerates a null list and null entries.
  const currentTab = contextMentions?.find(
    contextWebsite => contextWebsite?.type === "currentTab"
  );

  if (!currentTab) {
    return { url: "", title: "", description: "" };
  }

  const url = currentTab.url || "";
  const title = sanitizeUntrustedContent(currentTab.label || "");

  /**
   * TODO: BUG 2015574
   * Need to extract page description in PageExtractor
   */
  const description = "";

  return { url, title, description };
}

/**
 * Construct real time information injection message, to be inserted before
 * the memories injection message and the user message in the conversation
 * messages list.
 *
 * The result combines the current tab metadata from getCurrentTabMetadata with
 * the application locale, the resolved timezone, and the local timestamp.
 * `isoTimestamp` and `todayDate` fall back to "Unavailable" when no timestamp
 * could be produced. `hasTabInfo` is true only when at least one of url, title
 * or description is non-empty and the url is not the new-page URL.
 *
 * @param {Array<ContextWebsite>} contextMentions
 * @returns {Promise<{url, title, description, locale, timezone, isoTimestamp, todayDate, hasTabInfo}>}
 */
export async function constructRealTimeInfoInjectionMessage(
  contextMentions = []
) {
  const { url, title, description } =
    await getCurrentTabMetadata(contextMentions);

  // getLocalIsoTime() may return null; the date part is the text before "T".
  const isoTimestamp = getLocalIsoTime();
  const datePart = isoTimestamp?.split("T")[0] ?? "";

  const locale = Services.locale.appLocaleAsBCP47;
  const timezone = Intl.DateTimeFormat().resolvedOptions().timeZone;

  const hasTabInfo = Boolean(url || title || description) && !isNewPageUrl(url);

  return {
    url,
    title,
    description,
    locale,
    timezone,
    isoTimestamp: isoTimestamp || "Unavailable",
    todayDate: datePart || "Unavailable",
    hasTabInfo,
  };
}

/**
 * Constructs the relevant memories context message to be injected before the user message.
 *
 * Relevant memories are fetched from the MemoriesManager, rendered as a
 * bulleted list of "<id> - <summary>" lines, and substituted into the
 * MEMORIES_RELEVANT_CONTEXT prompt loaded from the engine.
 *
 * @param {string} message                              User message to find relevant memories for
 * @param {openAIEngine} engineInstance                 Engine used to load the prompt template
 * @returns {Promise<null|{role: string, content: string}>}
 *   System-role context message, or null if there are no relevant memories
 */
export async function constructRelevantMemoriesContextMessage(
  message,
  engineInstance
) {
  const relevantMemories =
    await lazy.MemoriesManager.getRelevantMemories(message);

  // If there aren't any relevant memories, return null
  if (!relevantMemories?.length) {
    return null;
  }

  // One "- <id> - <summary>" bullet per memory, newline separated.
  const relevantMemoriesList = relevantMemories
    .map(memory => `- ${memory.id} - ${memory.memory_summary}`)
    .join("\n");

  const relevantMemoriesContextPrompt = await engineInstance.loadPrompt(
    lazy.MODEL_FEATURES.MEMORIES_RELEVANT_CONTEXT
  );
  const content = lazy.renderPrompt(relevantMemoriesContextPrompt, {
    relevantMemoriesList,
  });

  return {
    role: "system",
    content,
  };
}

/**
 * Response parsing functions to detect special tagged information like memories and search terms.
 * Also return the cleaned content after removing all the taggings.
 *
 * Recognized tags are `§search: ...§` and `§existing_memory: ...§`
 * (case-insensitive). When at least one tag is found the cleaned content is
 * also trimmed; when none is found the content is returned unchanged.
 *
 * @param {string} content
 * @returns {Promise<{cleanContent: string, searchQueries: Array<string>, usedMemories: Array<string>}>}
 */
export async function parseContentWithTokens(content) {
  const searchRegex = /§search:\s*([^§]+)§/gi;
  const memoriesRegex = /§existing_memory:\s*([^§]+)§/gi;

  const searchTokens = detectTokens(content, searchRegex, "query");
  const memoriesTokens = detectTokens(content, memoriesRegex, "memories");

  // Sort all tokens in reverse index order for easier removal: cutting from
  // the end keeps the start/end indices of the earlier tokens valid.
  const allTokens = [...searchTokens, ...memoriesTokens].sort(
    (a, b) => b.startIndex - a.startIndex
  );

  if (allTokens.length === 0) {
    return {
      cleanContent: content,
      searchQueries: [],
      usedMemories: [],
    };
  }

  // Clean content by removing tagged information
  let cleanContent = content;
  const searchQueries = [];
  const usedMemories = [];

  for (const token of allTokens) {
    // Tokens arrive last-to-first, so unshift() restores document order.
    if (token.query) {
      searchQueries.unshift(token.query);
    } else if (token.memories) {
      usedMemories.unshift(token.memories);
      // TODO: do we need customEvent to dispatch used memories as we iterate?
    }

    cleanContent =
      cleanContent.slice(0, token.startIndex) +
      cleanContent.slice(token.endIndex);
  }

  return {
    cleanContent: cleanContent.trim(),
    searchQueries,
    usedMemories,
  };
}

/**
 * Given the content and the regex pattern to search, find all occurrence of matches.
 *
 * The scan always covers the whole content from index 0. It runs on a private
 * global copy of `regexPattern`, so a pattern without the `g` flag, a pattern
 * with a stale `lastIndex`, or a zero-length match cannot stall the scan, and
 * the caller's regex object is left untouched.
 *
 * @param {string} content
 * @param {RegExp} regexPattern - Pattern whose first capture group holds the token value
 * @param {string} key - Property name under which the trimmed capture is stored
 * @returns {Array<{fullMatch: string, startIndex: number, endIndex: number}>}
 *   One entry per match, in document order; each entry also carries `[key]`.
 */
export function detectTokens(content, regexPattern, key) {
  const text = String(content ?? "");

  // matchAll() requires the global flag; add it if the caller omitted it.
  const flags = regexPattern.flags.includes("g")
    ? regexPattern.flags
    : `${regexPattern.flags}g`;
  const scanner = new RegExp(regexPattern.source, flags);

  return Array.from(text.matchAll(scanner), match => ({
    fullMatch: match[0],
    // An unmatched or absent capture group yields an empty value.
    [key]: (match[1] ?? "").trim(),
    startIndex: match.index,
    endIndex: match.index + match[0].length,
  }));
}

/** Internal URL schemes that should not be cited. */
const INTERNAL_SCHEMES = [
  "chrome://",
  "about:",
  "resource://",
  "moz-extension://",
];

/**
 * Check if a URL uses an internal scheme.
 *
 * The comparison ignores leading whitespace and letter case, because URI
 * schemes are case-insensitive; "About:config" is as internal as
 * "about:config".
 *
 * @param {string} url - URL to check
 * @returns {boolean} True if URL is internal; false for non-string input
 */
function isInternalUrl(url) {
  if (typeof url !== "string") {
    return false;
  }

  const normalized = url.trimStart().toLowerCase();
  return INTERNAL_SCHEMES.some(scheme => normalized.startsWith(scheme));
}

/**
 * To filter specific URL chrome://browser/content/aiwindow/aiWindow.html
 *
 * The comparison is an exact, case-sensitive string match; a query string or
 * fragment on the URL makes it not match.
 *
 * @param {string} url - URL to check
 * @returns {boolean} True if url = chrome://browser/content/aiwindow/aiWindow.html
 */
export function isNewPageUrl(url) {
  return url === "chrome://browser/content/aiwindow/aiWindow.html";
}

/**
 * Extract valid external URLs from a list of sources.
 * Filters out internal schemes and deduplicates.
 *
 * Entries that are null/undefined, or whose `url` is missing, empty or not a
 * string, are skipped. The first occurrence of each URL determines its
 * position in the result.
 *
 * @param {Array<object>} sources - Array of source objects with url field
 * @returns {Array<string>} Unique valid external URLs; empty if `sources` is not an array
 */
export function extractValidUrls(sources) {
  if (!Array.isArray(sources)) {
    return [];
  }

  // A Set iterates in insertion order, so it deduplicates and keeps ordering.
  const uniqueUrls = new Set();

  for (const source of sources) {
    const url = source?.url;
    if (!url || typeof url !== "string" || isInternalUrl(url)) {
      continue;
    }
    uniqueUrls.add(url);
  }

  return [...uniqueUrls];
}