Integrate file exclusion with micromatch, refactor code

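Use micromatch for the file-exclusion logic in MarkdownGenerator, replacing the hand-rolled `isFileExcluded` glob matcher. Condense the constructor's JSDoc to reference the `MarkdownGeneratorOptions` typedef (the constructor signature itself is unchanged) and refactor `getTrackedFiles` for better readability. Add the micromatch dependency, bump the version to 1.0.8, and update the `dev` and `deploy:dev` scripts in package.json.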
This commit is contained in:
2024-11-21 13:02:22 -05:00
parent a875cbf5d4
commit 1226a742b5
4 changed files with 94 additions and 95 deletions

View File

@@ -1,6 +1,6 @@
{
"name": "code-tokenizer-md",
"version": "1.0.7",
"version": "1.0.8",
"type": "module",
"main": "dist/index.js",
"bin": {
@@ -13,15 +13,16 @@
"build": "rm -rf dist && mkdir dist && cp src/*.js dist/",
"test": "echo \"No tests specified\" && exit 0",
"prepublishOnly": "npm run build",
"dev": "node ./src/cli.js",
"deploy:dev": "pnpm publish .",
"dev": "npx .",
"deploy:dev": "pnpm build && pnpm publish .",
"lint": "eslint src/",
"lint:fix": "eslint src/ --fix",
"format": "prettier --write \"**/*.{js,jsx,ts,tsx,json,md,yml,yaml}\"",
"fix": "pnpm format && pnpm lint:fix"
},
"dependencies": {
"llama3-tokenizer-js": "^1.0.0"
"llama3-tokenizer-js": "^1.0.0",
"micromatch": "^4.0.8"
},
"peerDependencies": {
"node": ">=14.0.0"

42
pnpm-lock.yaml generated
View File

@@ -8,6 +8,9 @@ dependencies:
llama3-tokenizer-js:
specifier: ^1.0.0
version: 1.2.0
micromatch:
specifier: ^4.0.8
version: 4.0.8
node:
specifier: '>=14.0.0'
version: 22.11.0
@@ -174,6 +177,13 @@ packages:
concat-map: 0.0.1
dev: true
/braces@3.0.3:
resolution: {integrity: sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==}
engines: {node: '>=8'}
dependencies:
fill-range: 7.1.1
dev: false
/callsites@3.1.0:
resolution: {integrity: sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==}
engines: {node: '>=6'}
@@ -351,6 +361,13 @@ packages:
flat-cache: 4.0.1
dev: true
/fill-range@7.1.1:
resolution: {integrity: sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==}
engines: {node: '>=8'}
dependencies:
to-regex-range: 5.0.1
dev: false
/find-up@5.0.0:
resolution: {integrity: sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==}
engines: {node: '>=10'}
@@ -423,6 +440,11 @@ packages:
is-extglob: 2.1.1
dev: true
/is-number@7.0.0:
resolution: {integrity: sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==}
engines: {node: '>=0.12.0'}
dev: false
/isexe@2.0.0:
resolution: {integrity: sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==}
dev: true
@@ -475,6 +497,14 @@ packages:
resolution: {integrity: sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==}
dev: true
/micromatch@4.0.8:
resolution: {integrity: sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==}
engines: {node: '>=8.6'}
dependencies:
braces: 3.0.3
picomatch: 2.3.1
dev: false
/minimatch@3.1.2:
resolution: {integrity: sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==}
dependencies:
@@ -545,6 +575,11 @@ packages:
engines: {node: '>=8'}
dev: true
/picomatch@2.3.1:
resolution: {integrity: sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==}
engines: {node: '>=8.6'}
dev: false
/prelude-ls@1.2.1:
resolution: {integrity: sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==}
engines: {node: '>= 0.8.0'}
@@ -594,6 +629,13 @@ packages:
resolution: {integrity: sha512-N+8UisAXDGk8PFXP4HAzVR9nbfmVJ3zYLAWiTIoqC5v5isinhr+r5uaO8+7r3BMfuNIufIsA7RdpVgacC2cSpw==}
dev: true
/to-regex-range@5.0.1:
resolution: {integrity: sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==}
engines: {node: '>=8.0'}
dependencies:
is-number: 7.0.0
dev: false
/type-check@0.4.0:
resolution: {integrity: sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==}
engines: {node: '>= 0.8.0'}

View File

@@ -1,9 +1,11 @@
// MarkdownGenerator.js
import path from 'path';
import { execSync } from 'child_process';
import fs from 'fs/promises';
import { readFile, writeFile } from 'fs/promises';
import llama3Tokenizer from 'llama3-tokenizer-js';
import { TokenCleaner } from './TokenCleaner.js';
import micromatch from 'micromatch';
/**
* @typedef {Object} MarkdownGeneratorOptions
@@ -24,14 +26,7 @@ import { TokenCleaner } from './TokenCleaner.js';
export class MarkdownGenerator {
/**
* Creates an instance of MarkdownGenerator.
* @param {Object} [options={}] - Configuration options for the generator
* @param {string} [options.dir='.'] - The directory to process files from
* @param {string} [options.outputFilePath='./prompt.md'] - Path where the output markdown file will be saved
* @param {Set<string>} [options.fileTypeExclusions] - Set of file extensions to exclude (defaults to common image and asset files)
* @param {string[]} [options.fileExclusions] - Array of specific files or patterns to exclude
* @param {Object} [options.customPatterns] - Custom patterns for token cleaning
* @param {Object} [options.customSecretPatterns] - Custom patterns for identifying and redacting secrets
* @param {boolean} [options.verbose=true] - Whether to log detailed information during processing
* @param {MarkdownGeneratorOptions} [options={}] - Configuration options for the generator
*/
constructor(options = {}) {
this.dir = options.dir || '.';
@@ -110,7 +105,7 @@ export class MarkdownGenerator {
'**/jsconfig.json',
'**/jsconfig*.json',
'**/package-lock.json',
'**/.prettierignore',
// Environment and variables
'**/.env*',
'**/*.vars',
@@ -171,9 +166,11 @@ export class MarkdownGenerator {
'**/temp/',
'**/*.log'
];
this.tokenCleaner = new TokenCleaner(options.customPatterns, options.customSecretPatterns);
this.verbose = options.verbose ?? true;
this.verbose = options.verbose !== undefined ? options.verbose : true;
}
/**
* Retrieves a list of files tracked by Git, excluding those specified in fileTypeExclusions and fileExclusions.
* @async
@@ -183,15 +180,21 @@ export class MarkdownGenerator {
async getTrackedFiles() {
try {
const output = this.execCommand('git ls-files');
const trackedFiles = output.split('\n').filter(file => file.length > 0);
const trackedFiles = output.split('\n').filter(file => file.trim().length > 0);
if (this.verbose) {
console.log(`Total tracked files: ${trackedFiles.length}`);
}
return trackedFiles.filter(file => {
// Use micromatch to filter out excluded files
const filteredFiles = trackedFiles.filter(file => {
const fileExt = path.extname(file).toLowerCase();
const isExcluded = this.fileExclusions.some(pattern => this.isFileExcluded(file, pattern));
return !this.fileTypeExclusions.has(fileExt) && !isExcluded;
return !this.fileTypeExclusions.has(fileExt) && !micromatch.isMatch(file, this.fileExclusions, { dot: true });
});
if (this.verbose) {
const excludedCount = trackedFiles.length - filteredFiles.length;
console.log(`Excluded files: ${excludedCount}`);
console.log(`Files to process after exclusions: ${filteredFiles.length}`);
}
return filteredFiles;
} catch (error) {
if (this.verbose) {
console.error('Error fetching tracked files:', error);
@@ -199,72 +202,7 @@ export class MarkdownGenerator {
return [];
}
}
/**
* Determines if a file should be excluded based on the given pattern.
* @param {string} filePath - Path of the file to check
* @param {string} pattern - Exclusion pattern to match against
* @returns {boolean} True if the file should be excluded, false otherwise
* @example
* // Excludes all files in a directory
* isFileExcluded('src/tests/file.js', 'src/tests/*') // returns true
* // Excludes specific file extensions in a directory
* isFileExcluded('src/assets/image.png', 'src/assets/*.png') // returns true
*/
isFileExcluded(filePath, pattern) {
// Normalize paths to use forward slashes
filePath = filePath.replace(/\\/g, '/');
pattern = pattern.replace(/\\/g, '/');
// Handle directory-only patterns (ending with /)
if (pattern.endsWith('/')) {
const directory = pattern.slice(0, -1);
if (directory.startsWith('**/')) {
const dirToMatch = directory.slice(3);
return filePath.includes(`${dirToMatch}/`);
}
return filePath.startsWith(directory + '/');
}
// Handle brace expansion for extensions {js,ts}
if (pattern.includes('{') && pattern.includes('}')) {
const [basePath, extensionsGroup] = pattern.split('{');
const extensions = extensionsGroup.slice(0, -1).split(',');
return extensions.some(ext =>
this.isFileExcluded(filePath, basePath + ext)
);
}
// Handle directory/*.ext patterns (extension wildcards in specific directories)
if (pattern.includes('/*.')) {
const [directory, ext] = pattern.split('/*.');
const relativePath = filePath.slice(directory.length + 1);
return filePath.startsWith(directory + '/') &&
!relativePath.includes('/') &&
filePath.endsWith('.' + ext);
}
// Handle pure extension wildcards (*.ext)
if (pattern.startsWith('*.')) {
return filePath.endsWith(pattern.slice(1));
}
// Convert glob pattern to regex for remaining cases
const regexPattern = pattern
// Escape special regex characters except * and ?
.replace(/[.+^${}()|[\]\\]/g, '\\$&')
// Replace ** with special placeholder
.replace(/\*\*/g, '{{GLOBSTAR}}')
// Replace * with regex pattern
.replace(/\*/g, '[^/]*')
// Replace globstar placeholder with proper regex
.replace(/{{GLOBSTAR}}/g, '.*')
// Anchor the regex
.replace(/^/, '^')
.replace(/$/, '$');
const regex = new RegExp(regexPattern);
return regex.test(filePath);
}
/**
* Reads and processes the content of a file, cleaning and redacting sensitive information.
* @async
@@ -274,7 +212,7 @@ export class MarkdownGenerator {
*/
async readFileContent(filePath) {
try {
const content = await fs.readFile(filePath, 'utf-8');
const content = await readFile(filePath, 'utf-8');
const cleanedAndRedactedContent = this.tokenCleaner.cleanAndRedact(content);
if (this.verbose) {
const tokenCount = llama3Tokenizer.encode(cleanedAndRedactedContent).length;
@@ -288,6 +226,7 @@ export class MarkdownGenerator {
return '';
}
}
/**
* Generates markdown content from all tracked files in the project.
* @async
@@ -302,11 +241,17 @@ export class MarkdownGenerator {
let markdownContent = '# Project Files\n\n';
for (const file of trackedFiles) {
const content = await this.readFileContent(path.join(this.dir, file));
markdownContent += `## ${file}\n~~~\n${content.trim()}\n~~~\n`;
const absolutePath = path.join(this.dir, file);
const content = await this.readFileContent(absolutePath);
if (content.trim()) { // Only include files with content after cleaning
markdownContent += `## ${file}\n~~~\n${content.trim()}\n~~~\n\n`;
} else if (this.verbose) {
console.log(`Skipping ${file} as it has no content after cleaning.`);
}
}
return markdownContent;
}
/**
* Retrieves the content of the project's todo file, creating it if it doesn't exist.
* @async
@@ -314,19 +259,28 @@ export class MarkdownGenerator {
* @throws {Error} When unable to read or create the todo file
*/
async getTodo() {
const todoPath = path.join(this.dir, 'todo');
try {
if (this.verbose) {
console.log('Reading todo file');
return await readFile('./todo', 'utf-8');
}
return await readFile(todoPath, 'utf-8');
} catch (error) {
if (error.code === 'ENOENT') {
// File does not exist
if (this.verbose) {
console.log("File not found, creating a new 'todo' file.");
await writeFile('./todo', ''); // Create an empty 'todo' file
return this.getTodo(); // Call the function again
}
await writeFile(todoPath, ''); // Create an empty 'todo' file
return await this.getTodo(); // Await the recursive call
}
if (this.verbose) {
console.error('Error reading todo file:', error);
}
throw error;
}
}
/**
* Creates a complete markdown document combining code documentation and todos.
* @async
@@ -340,8 +294,8 @@ export class MarkdownGenerator {
try {
const codeMarkdown = await this.generateMarkdown();
const todos = await this.getTodo();
const markdown = codeMarkdown + `\n---\n${todos}\n`;
await fs.writeFile(this.outputFilePath, markdown);
const markdown = codeMarkdown + `\n---\n\n${todos}\n`;
await writeFile(this.outputFilePath, markdown);
if (this.verbose) {
console.log(`Markdown document created at ${this.outputFilePath}`);
const totalTokens = llama3Tokenizer.encode(markdown).length;
@@ -355,6 +309,7 @@ export class MarkdownGenerator {
return { success: false, error };
}
}
/**
* Executes a shell command in the specified directory.
* @param {string} command - Shell command to execute

View File

@@ -1,4 +1,5 @@
#!/usr/bin/env node
console.log("RUNNING TOKENIZER")
import { MarkdownGenerator } from './MarkdownGenerator.js';
const generator = new MarkdownGenerator();