Skip to content

Latest commit

 

History

History
320 lines (257 loc) · 13.7 KB

GPT-4-failure.md

File metadata and controls

320 lines (257 loc) · 13.7 KB

Note: This case requires closely coupled code comprehension and generation. The code is part of a Chrome extension that performs RAG directly in the browser. The function markElements was generated earlier by GPT-4 and needed no manual correction. The loop that merges element text by embedding similarity was generated by GPT-4 with mistakes and then manually corrected. Most of the comments were generated by GPT-4 and are not very precise; they can be deleted to compare the models' feedback.

👨‍💻modify the following function markElements and related code to mark elements belonging to different cluster of mergedTxt with different colors.

/**
 * Outlines every given element with a dashed red border.
 *
 * A <style> element whose id is derived from the extension's runtime id is
 * added to <head> the first time this runs; later calls reuse it and only tag
 * the elements.
 *
 * @param {Iterable<Element>} elements - Collection exposing `forEach`
 *   (for example an Array or NodeList) of elements to tag.
 */
function markElements(elements) {
  const HIGHLIGHT_CLASS = 'highlighted-element';
  const HIGHLIGHT_RULE = `
      .highlighted-element {
        border: 2px dashed red !important; /* 使用红色虚线边框,!important 确保覆盖其他边框样式 */
      }
    `;
  const sheetId = chrome.runtime.id + '_highlight';

  // Install the stylesheet only if an element with this id is not present yet.
  const existing = document.getElementById(sheetId);
  if (!existing) {
    const styleNode = document.createElement('style');
    styleNode.id = sheetId; // unique id so the check above finds it next time
    // The node is attached before `sheet` is used to insert the rule.
    document.head.appendChild(styleNode);
    styleNode.sheet.insertRule(HIGHLIGHT_RULE, 0);
  }

  // Tag each element with the highlight class.
  elements.forEach((node) => {
    node.classList.add(HIGHLIGHT_CLASS);
  });
}

/**
 * Collects the large text-bearing elements of the page, skipping any element
 * that sits inside one already collected.
 *
 * The tag list is similar to the one used by LlamaIndex's HTMLNodeParser:
 * https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules.html#htmlnodeparser
 * (`section` and `article` are intentionally not part of the selector).
 *
 * @returns {Element[]} Elements whose offsetWidth * offsetHeight exceeds 6000,
 *   in document-query order, none contained in an earlier kept element.
 */
function findContentElements() {
  const MIN_AREA = 6000;
  const candidates = document.querySelectorAll('p, span, li, h1, h2, h3, h4, h5, h6, pre, code, b, i, u');
  const kept = [];

  candidates.forEach((node) => {
    // Skip anything that is not strictly larger than the area threshold.
    if (!(node.offsetWidth * node.offsetHeight > MIN_AREA)) {
      return;
    }
    // Skip nodes nested in an element that was already kept.
    const nestedInKept = kept.some((ancestor) => ancestor.contains(node));
    if (!nestedInKept) {
      kept.push(node);
    }
  });

  return kept;
}

/**
 * Highlights the page's content elements, then (asynchronously) embeds their
 * text, merges neighbouring texts into larger chunks, clusters the merged
 * chunks and logs the result.
 *
 * Relies on names defined outside this function: findContentElements,
 * markElements, LLMSingleton, WebGPUSingleton, cos_sim, percentile and KMEANS.
 * The asynchronous part is started but not awaited; any failure inside it is
 * reported through console.error.
 */
function main(){
  const elems = findContentElements();
  markElements(elems);
  const txts = elems.map(el => (el.textContent || el.innerText).trim());

  console.log("chunking text of elements "+txts.length);
  (async function () {
    const tokenizer = await LLMSingleton.getTokenizer();
    // Length of a text measured as the `size` of the tokenizer's input_ids.
    async function lengthFunction(text){
      const { input_ids } = await tokenizer(text);
      return input_ids.size;
    }
    let start = performance.now();
    let embeddings = await WebGPUSingleton.infer(txts);
    let end = performance.now();
    console.log(`WebGPU Batch Execution time: ${end - start} ms`);
    // Score each text against the one right before it (1 - cos_sim); neighbours
    // scoring below the threshold are merged, up to max_seq_length.
    start = performance.now();
    let pvector;
    const scores = embeddings.tolist().map(
      (embedding, i) => {
        const score = i > 0 ? 1 - cos_sim(pvector, embedding) : 0;
        // Advance the reference vector on every step so the next score is
        // measured against this embedding, not against the very first one.
        pvector = embedding;
        return score;
      }
    );
    end = performance.now();
    console.log(`Score calculation time: ${end - start} ms`);
    // Drop the placeholder for the first text: afterwards scores[i] is the
    // score between txts[i] and txts[i + 1].
    scores.shift();
    const threshold = percentile(75,scores);
    console.log(threshold);
    const max_seq_length = LLMSingleton.getMaxSeqLength();
    let mergedTxts = [];
    let brk_score=0, brk_length=0;
    let i = 0;
    while (i < txts.length) {
      let currentTxt = txts[i];

      // Keep absorbing the next text while the score to it is below the threshold.
      while (i < scores.length && scores[i] < threshold) {
        const nextTxtContent = (i + 1 < txts.length) ? txts[i + 1] : "";
        const concatenatedContent = currentTxt + "\n"+ nextTxtContent;
        const concatenatedLength = await lengthFunction(concatenatedContent);

        if (concatenatedLength < max_seq_length) {
          currentTxt = concatenatedContent;
          i++; // the next text is now part of the current chunk
        } else {
          brk_length++;
          break; // merged text would reach max_seq_length; close the chunk here
        }
      }
      // The chunk ended because the score to the next text reached the threshold.
      if (i < scores.length && scores[i] >= threshold) brk_score++;

      mergedTxts.push(currentTxt);

      i++; // step past the last text consumed by this chunk
    }
    console.log(mergedTxts);
    console.log(`break by score:${brk_score}, break by length:${brk_length}`);

    let vs = await WebGPUSingleton.infer(mergedTxts);
    vs = vs.tolist();
    let kmeans = new KMEANS();
    // second argument: 2 - number of clusters
    let clusters = kmeans.run(vs, 2);
    // Order clusters by the length of their longest merged text, longest first.
    clusters.sort(function(a, b) {
      return Math.max(...(b.map(i => mergedTxts[i].length))) - Math.max(...(a.map(i => mergedTxts[i].length)));
    });
    console.log(clusters);
    // Print at most five sample texts per cluster.
    for (let c=0; c<clusters.length;c++){
      for (let i =0; i<Math.min(clusters[c].length,5);i++)
        console.log(`c${c+1} ${i+1}/${clusters[c].length} ${mergedTxts[clusters[c][i]]}`)
    }
  })().catch((err) => {
    // Nothing awaits this promise, so report failures explicitly.
    console.error("main failed", err);
  });
}

❌GPT-4

To modify the markElements function and related code to mark elements belonging to different clusters of mergedTxt with different colors, we need to update the code to:

  1. Assign different classes with different border colors to elements based on their cluster.
  2. Update the markElements function to accept a second parameter that specifies the color to use for highlighting.
  3. After clustering, apply the updated markElements function to each cluster with a unique color.

Here's how you can modify the code:

/**
 * Outlines the given elements with a dashed border in the requested color.
 *
 * One shared <style> element (id derived from the extension's runtime id) holds
 * a `.highlighted-<color>` rule per color; a rule is appended only when no
 * existing rule's selector text already contains that class name.
 *
 * @param {Iterable<Element>} elements - Collection exposing `forEach` of elements to tag.
 * @param {string} color - CSS color; used both in the class name and the border rule.
 */
function markElements(elements, color) {
  const styleId = chrome.runtime.id + '_highlight';

  // Create the shared <style> element on first use.
  if (!document.getElementById(styleId)) {
    const styleNode = document.createElement('style');
    styleNode.id = styleId;
    document.head.appendChild(styleNode);
  }

  const sheet = document.getElementById(styleId).sheet;
  const className = `highlighted-${color}`;

  // Append the color rule at the end of the sheet unless one is already there.
  const ruleExists = Array.from(sheet.cssRules).some((rule) => rule.selectorText.includes(className));
  if (!ruleExists) {
    sheet.insertRule(`
      .highlighted-${color} {
        border: 2px dashed ${color} !important;
      }
    `, sheet.cssRules.length);
  }

  // Tag every element with the color-specific class.
  elements.forEach((node) => {
    node.classList.add(className);
  });
}

/**
 * Placeholder: should return the DOM elements corresponding to the given merged texts.
 *
 * The lookup itself is not implemented. A real version needs either a stored
 * reference to the original elements captured when the texts were collected,
 * or some form of matching between text content and elements.
 *
 * @param {string[]} mergedTexts - Merged texts whose source elements are wanted.
 * @returns {Element[]} Currently always an empty array, so callers that iterate
 *   the result (e.g. with `forEach`) do not fail on `undefined`.
 */
function findElementsForMergedTexts(mergedTexts) {
  // TODO: implement the text-to-element lookup described above.
  return [];
}

/**
 * Sketch of the entry point: collects content elements and their trimmed text,
 * then highlights the elements of each cluster in a color taken from a fixed palette.
 *
 * NOTE(review): `clusters` and `mergedTxts` are not defined in this snippet;
 * they are presumably produced by the elided chunking/embedding/clustering code
 * marked below — confirm when integrating.
 */
async function main() {
  const elems = findContentElements();
  const txts = elems.map(el => (el.textContent || el.innerText).trim());

  console.log("chunking text of elements "+txts.length);
  // Existing code to chunk texts and calculate embeddings

  // After clustering
  let clusterColors = ['red', 'blue', 'green']; // Add more colors if more clusters
  for (let c = 0; c < clusters.length; c++) {
    // Each cluster entry is used as an index into mergedTxts; the resulting texts are passed to the lookup.
    let elementsForCluster = findElementsForMergedTexts(clusters[c].map(index => mergedTxts[index]));
    // The modulo wraps around the palette, so colors repeat once there are more clusters than colors.
    markElements(elementsForCluster, clusterColors[c % clusterColors.length]);
  }
}

Key changes and considerations:

  • The markElements function now accepts a color parameter to dynamically create and apply styles based on the cluster. This means you can call markElements with a different color for each cluster.
  • I added a placeholder function findElementsForMergedTexts that you need to implement. This function should map the merged texts back to their corresponding elements in the DOM. This mapping is crucial for highlighting the correct elements and might require additional logic not shown here.
  • The color assignment uses a simple array clusterColors. You should extend this array with more colors if you expect more than three clusters.
  • Ensure that your method of mapping merged texts to elements accurately reflects their presence in the DOM. This could be the most challenging part, depending on how mergedTxts are generated and how they correlate to the original elements.

👨‍💻implement the function findElementsForMergedTexts

❌GPT-4

Implementing the function findElementsForMergedTexts to map merged texts back to their corresponding elements in the DOM requires a strategy for tracking the relationship between texts and elements. Since the mergedTxts are created by combining the text content of multiple elements, you need a way to remember which elements contributed to each merged text. One approach is to use a data structure to map each merged text to its original elements during the merging process.

Here is an implementation outline that includes modifications to your existing code to support this mapping, followed by the findElementsForMergedTexts function:

Step 1: Modify the Merging Process to Track Elements

First, modify your text merging process to keep track of which elements contribute to each merged text. This involves creating a mapping structure that associates each merged text with a list of its original DOM elements.

// Lookup table keyed by merged-text index; each value is the list of DOM
// elements that merged text was built from.
let mergedTextToElementsMap = new Map();

// Seed one entry per merged text. This is only an example: the real lists must
// be supplied by the existing merging logic, which is where this belongs.
mergedTxts.forEach((_mergedTxt, position) => {
  // Stand-in value: an empty list until the merging logic provides the elements.
  mergedTextToElementsMap.set(position, []);
});

Step 2: Implement findElementsForMergedTexts

Given the mapping structure, findElementsForMergedTexts can now use it to find the corresponding elements for each merged text:

/**
 * Gathers the elements recorded in `mergedTextToElementsMap` for the given
 * merged-text indices.
 *
 * @param {number[]} mergedTextIndices - Keys to look up in the map.
 * @returns {Element[]} The recorded element lists joined in the order of the
 *   indices; indices with no map entry contribute nothing.
 */
function findElementsForMergedTexts(mergedTextIndices) {
  return mergedTextIndices.flatMap((key) => (
    mergedTextToElementsMap.has(key) ? mergedTextToElementsMap.get(key) : []
  ));
}

Integrating the Mapping into Your Merging Logic

The critical part of this implementation is accurately populating mergedTextToElementsMap during the text merging process. This requires you to adjust your merging logic to track not just the texts but also which DOM elements those texts came from.

Here's a conceptual example of how you might adjust the merging part to track the elements:

// Conceptual fragments only: each statement below belongs at a different point
// of the merging loop, and `element` / `nextElement` stand for whichever DOM
// elements that loop is handling at that moment.
let currentTextElements = []; // Track elements for the current merged text

// When starting a new merged text
currentTextElements = [element]; // Start with the current element

// When adding another text to the current merged text
currentTextElements.push(nextElement); // Add the next element to the list

// After finalizing a merged text
// NOTE(review): mergedTxts.length equals the new text's index only if this runs
// BEFORE the text is pushed onto mergedTxts — confirm the ordering when integrating.
mergedTextToElementsMap.set(mergedTxts.length, currentTextElements); // Use the current length of mergedTxts as the index

This example is simplified. Your actual implementation will need to integrate with your logic for deciding when to merge texts based on the tokenizer's output, the scores, and the max_seq_length. The key is to maintain a list of elements that are being merged together and to store this list in mergedTextToElementsMap once the merged text is finalized.

❌Gemini Pro

Here's the modified code with explanations:

/**
 * Outlines each element with a dashed border colored by its cluster.
 *
 * For every colored element a dedicated <style> element (id `highlight-<i>`)
 * with a `.highlighted-element-<i>` rule is appended to <head>, and that class
 * is added to the element.
 *
 * @param {Element[]} elements - Elements to highlight.
 * @param {number[]} clusterAssignments - clusterAssignments[i] is the cluster id
 *   of elements[i]; ids are used as indices into the color list, so they are
 *   assumed to be non-negative integers starting at 0.
 */
function markElements(elements, clusterAssignments) {
  // The helper returns the largest id; with ids starting at 0 the number of
  // clusters is one more than that.
  const numClusters = getMaxClusterId(clusterAssignments) + 1;
  const colors = generateDistinctColors(numClusters); // one color per cluster

  for (let i = 0; i < elements.length; i++) {
    const clusterId = clusterAssignments[i];
    // Wrap around in case fewer colors than clusters were generated.
    const color = colors[clusterId % colors.length];
    if (color === undefined) {
      // No usable assignment for this element (e.g. the assignments array is
      // shorter than the element list): leave it unmarked rather than emit an
      // invalid border rule.
      continue;
    }

    const style = document.createElement('style');
    style.id = `highlight-${i}`; // per-element id to avoid conflicts
    document.head.appendChild(style);

    style.sheet.insertRule(`
      .highlighted-element-${i} {
        border: 2px dashed ${color} !important;
      }
    `, 0);

    elements[i].classList.add(`highlighted-element-${i}`); // per-element class for styling
  }
}

// ... other functions unchanged ...

/**
 * Sketch of the entry point showing only where the cluster-aware markElements
 * call goes; everything else is elided.
 *
 * NOTE(review): `clusters`, `elems` and `kmeans` come from the elided code.
 * `kmeans.labels` is not shown anywhere in this snippet — confirm the clustering
 * library exposes it, and that it holds one entry per element of `elems`.
 */
function main() {
  // ... other code unchanged ...

  clusters.sort(function(a, b) {
    // ... sorting logic unchanged ...
  });

  markElements(elems, kmeans.labels); // Pass the cluster assignments to markElements

  // ... rest of the code unchanged ...
}

// Helper functions:

/**
 * Returns the largest cluster id found in the assignments.
 *
 * Iterates instead of spreading the values into `Math.max(...)`, because
 * spreading a very large array can exceed the engine's argument limit.
 *
 * @param {Iterable<number>} clusterAssignments - Cluster id per element.
 * @returns {number} The maximum id, or -Infinity when there are no assignments.
 */
function getMaxClusterId(clusterAssignments) {
  let highest = -Infinity;
  for (const clusterId of clusterAssignments) {
    highest = Math.max(highest, clusterId);
  }
  return highest;
}

/**
 * Produces a list of CSS colors, one per requested slot.
 *
 * The first six entries come from a fixed named palette; any further entries
 * are generated `hsl(...)` values whose hue advances by the golden angle, so
 * the result always has the requested length instead of stopping at six.
 *
 * @param {number} numColors - How many colors are needed.
 * @returns {string[]} `numColors` CSS color strings (fraction truncated);
 *   an empty array for zero, negative or NaN input; the plain palette when
 *   `numColors` is Infinity.
 */
function generateDistinctColors(numColors) {
  const palette = ['red', 'green', 'blue', 'purple', 'orange', 'yellow'];

  // Covers 0, negative numbers, -Infinity and NaN.
  if (!(numColors > 0)) {
    return [];
  }
  // An unbounded request cannot be generated; fall back to the fixed palette.
  if (!Number.isFinite(numColors)) {
    return [...palette];
  }

  const count = Math.floor(numColors);
  return Array.from({ length: count }, (_, slot) => (
    slot < palette.length
      ? palette[slot]
      : `hsl(${Math.round((slot * 137.508) % 360)}, 70%, 45%)`
  ));
}

Key modifications:

  1. Passing cluster assignments: The markElements function now accepts clusterAssignments, an array indicating the cluster each element belongs to.
  2. Generating colors: The generateDistinctColors function (replace with your preferred color generation logic) creates an array of distinct colors based on the number of clusters.
  3. Unique styling rules: Each element is assigned a unique class and style rule to prevent conflicts and ensure proper coloring.
  4. Calling markElements with assignments: In the main function, the markElements function is called with kmeans.labels (which holds the cluster assignments) after clustering.

This code ensures that elements belonging to different clusters are highlighted with distinct colors, visually representing their groupings.