From 17031d8be8737cf0e4a03e79db90ac9470b870fc Mon Sep 17 00:00:00 2001 From: Geoff Seemueller Date: Wed, 30 Oct 2024 11:59:30 -0400 Subject: [PATCH] init --- .gitignore | 3 + README.md | 136 +++++++++++++++++++++++++++++++++++++++ package.json | 15 +++++ pnpm-lock.yaml | 32 +++++++++ src/MarkdownGenerator.js | 98 ++++++++++++++++++++++++++++ src/TokenCleaner.js | 41 ++++++++++++ src/cli.js | 14 ++++ src/index.js | 3 + 8 files changed, 342 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 package.json create mode 100644 pnpm-lock.yaml create mode 100644 src/MarkdownGenerator.js create mode 100644 src/TokenCleaner.js create mode 100755 src/cli.js create mode 100644 src/index.js diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9f83034 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/.idea/ +/node_modules/ +/dist/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..e878cf8 --- /dev/null +++ b/README.md @@ -0,0 +1,136 @@ +# code-tokenizer-md + +Process git repository files into markdown with token counting and sensitive data redaction. + +## Overview + +`code-tokenizer-md` is a Node.js tool that processes git repository files, cleans code, redacts sensitive information, and generates markdown documentation with token counts. 
+ +```mermaid +graph TD + Start[Start] -->|Read| Git[Git Files] + Git -->|Clean| TC[TokenCleaner] + TC -->|Redact| Clean[Clean Code] + Clean -->|Generate| MD[Markdown] + MD -->|Count| Results[Token Counts] + style Start fill:#000000,stroke:#FFFFFF,stroke-width:4px,color:#ffffff + style Git fill:#222222,stroke:#FFFFFF,stroke-width:2px,color:#ffffff + style TC fill:#333333,stroke:#FFFFFF,stroke-width:2px,color:#ffffff + style Clean fill:#444444,stroke:#FFFFFF,stroke-width:2px,color:#ffffff + style MD fill:#555555,stroke:#FFFFFF,stroke-width:2px,color:#ffffff + style Results fill:#666666,stroke:#FFFFFF,stroke-width:2px,color:#ffffff +``` + +## Features + +### Data Processing +- Reads files from git repository +- Removes comments and unnecessary whitespace +- Redacts sensitive information (API keys, tokens, etc.) +- Counts tokens using llama3-tokenizer + +### Analysis Types +- Token counting per file +- Total token usage +- File content analysis +- Sensitive data detection + +### Data Presentation +- Markdown formatted output +- Code block formatting +- Token count summaries +- File organization hierarchy + +## Requirements + +- Node.js (>=14.0.0) +- Git repository +- npm or npx + +## Installation + +```shell +npm install -g code-tokenizer-md +``` + +## Usage + +### Quick Start + +```shell +npx code-tokenizer-md +``` + +### Programmatic Usage + +```javascript +import { MarkdownGenerator } from 'code-tokenizer-md'; + +const generator = new MarkdownGenerator({ + dir: './project', + outputFilePath: './output.md' +}); + +const result = await generator.createMarkdownDocument(); +``` + +## Project Structure + +``` +src/ +├── index.js # Main exports +├── TokenCleaner.js # Code cleaning and redaction +├── MarkdownGenerator.js # Markdown generation logic +└── cli.js # CLI implementation +``` + +## Dependencies + +```json +{ + "dependencies": { + "llama3-tokenizer-js": "^1.0.0" + }, + "peerDependencies": { + "node": ">=14.0.0" + } +} +``` + +## Extending + +### Adding Custom 
Patterns + +```javascript +const generator = new MarkdownGenerator({ + customPatterns: [ + { regex: /TODO:/g, replacement: '' } + ], + customSecretPatterns: [ + { regex: /mySecret/g, replacement: '[REDACTED]' } + ] +}); +``` + +## Contributing + +1. Fork the repository +2. Create a feature branch +3. Commit your changes +4. Push to the branch +5. Open a Pull Request + +### Contribution Guidelines + +- Follow Node.js best practices +- Include appropriate error handling +- Add documentation for new features +- Include tests for new functionality (this project needs a suite) +- Update the README for significant changes + +## License +MIT © 2024 Geoff Seemueller + +## Note + +This tool requires a git repository to function properly. \ No newline at end of file diff --git a/package.json b/package.json new file mode 100644 index 0000000..1481476 --- /dev/null +++ b/package.json @@ -0,0 +1,15 @@ +{ + "name": "code-tokenizer-md", + "version": "1.0.0", + "type": "module", + "main": "src/index.js", + "bin": { + "code-tokenizer-md": "./src/cli.js" + }, + "dependencies": { + "llama3-tokenizer-js": "^1.0.0" + }, + "peerDependencies": { + "node": ">=14.0.0" + } +} \ No newline at end of file diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml new file mode 100644 index 0000000..43a16c8 --- /dev/null +++ b/pnpm-lock.yaml @@ -0,0 +1,32 @@ +lockfileVersion: '6.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +dependencies: + llama3-tokenizer-js: + specifier: ^1.0.0 + version: 1.2.0 + node: + specifier: '>=14.0.0' + version: 22.11.0 + +packages: + + /llama3-tokenizer-js@1.2.0: + resolution: {integrity: sha512-oMgIgK958UlvoEm3Lz/gAj3QAKpnAMb6YqlY0aTYraSK/c+V3TF3P7IWFQJe4yjM60+2/KoK+EWziec6WQ57/g==} + dev: false + + /node-bin-setup@1.1.3: + resolution: {integrity: sha512-opgw9iSCAzT2+6wJOETCpeRYAQxSopqQ2z+N6BXwIMsQQ7Zj5M8MaafQY8JMlolRR6R1UXg2WmhKp0p9lSOivg==} + dev: false + + /node@22.11.0: + resolution: {integrity: 
// src/MarkdownGenerator.js
import path from 'path';
import { execSync } from 'child_process';
import fs from 'fs/promises';
import llama3Tokenizer from 'llama3-tokenizer-js';
import { TokenCleaner } from './TokenCleaner.js';

/**
 * Builds a single markdown document from the tracked files of a git
 * repository: each file is cleaned and secret-redacted via TokenCleaner,
 * embedded in a fenced block, and llama3 token counts are reported.
 */
export class MarkdownGenerator {
  /**
   * @param {object} [options]
   * @param {string} [options.dir='.'] - Repository root to scan (used as cwd for git).
   * @param {string} [options.outputFilePath='./prompt.md'] - Destination markdown file.
   * @param {string[]} [options.fileTypeExclusions] - Extensions to skip (lowercase, with leading dot).
   * @param {string[]} [options.fileExclusions] - Exact paths or glob-ish patterns ('dir/*', 'dir/*.ext') to skip.
   * @param {Array} [options.customPatterns] - Extra cleanup rules forwarded to TokenCleaner.
   * @param {Array} [options.customSecretPatterns] - Extra redaction rules forwarded to TokenCleaner.
   * @param {boolean} [options.verbose=true] - Log progress and per-file token counts.
   */
  constructor(options = {}) {
    this.dir = options.dir || '.';
    this.outputFilePath = options.outputFilePath || './prompt.md';
    this.fileTypeExclusions = new Set(
      options.fileTypeExclusions || ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.tiff', '.lockb', '.yaml', '.ico', '.ttf', '.css'],
    );
    this.fileExclusions = options.fileExclusions || ['prompt.js', '.gitignore', '.env', '.dev.vars'];
    this.tokenCleaner = new TokenCleaner(options.customPatterns, options.customSecretPatterns);
    this.verbose = options.verbose ?? true;
  }

  /**
   * Lists git-tracked files, minus excluded extensions and paths.
   * @returns {Promise<string[]>} Relative paths; empty array if git fails.
   */
  async getTrackedFiles() {
    try {
      const output = this.execCommand('git ls-files');
      const trackedFiles = output.split('\n').filter(file => file.length > 0);
      if (this.verbose) console.log(`Total tracked files: ${trackedFiles.length}`);
      return trackedFiles.filter(file => {
        const fileExt = path.extname(file).toLowerCase();
        const isExcluded = this.fileExclusions.some(pattern => this.isFileExcluded(file, pattern));
        return !this.fileTypeExclusions.has(fileExt) && !isExcluded;
      });
    } catch (error) {
      // Best-effort: a missing git repo degrades to an empty file list.
      if (this.verbose) console.error('Error fetching tracked files:', error);
      return [];
    }
  }

  /**
   * Matches a path against an exclusion pattern: 'dir/*' (prefix),
   * 'dir/*.ext' (prefix + suffix), or an exact path.
   * @param {string} filePath
   * @param {string} pattern
   * @returns {boolean}
   */
  isFileExcluded(filePath, pattern) {
    if (pattern.endsWith('/*')) {
      const directory = pattern.slice(0, -2);
      return filePath.startsWith(directory);
    }
    if (pattern.includes('/*')) {
      const [directory, ext] = pattern.split('/*');
      return filePath.startsWith(directory) && filePath.endsWith(ext);
    }
    return filePath === pattern;
  }

  /**
   * Reads a file and returns its cleaned, secret-redacted content.
   * @param {string} filePath
   * @returns {Promise<string>} Cleaned content; empty string on read error.
   */
  async readFileContent(filePath) {
    try {
      const content = await fs.readFile(filePath, 'utf-8');
      const cleanedAndRedactedContent = this.tokenCleaner.cleanAndRedact(content);
      if (this.verbose) {
        const tokenCount = llama3Tokenizer.encode(cleanedAndRedactedContent).length;
        console.log(`${filePath}: Tokens[${tokenCount}]`);
      }
      return cleanedAndRedactedContent;
    } catch (error) {
      // Unreadable files contribute nothing rather than aborting the run.
      if (this.verbose) console.error(`Error reading file ${filePath}:`, error);
      return '';
    }
  }

  /**
   * Assembles the markdown document: one `## <file>` section with a
   * fenced code block per tracked file.
   * @returns {Promise<string>}
   */
  async generateMarkdown() {
    const trackedFiles = await this.getTrackedFiles();
    if (this.verbose) console.log(`Generating markdown for ${trackedFiles.length} files`);
    let markdownContent = '# Project Files\n\n';

    for (const file of trackedFiles) {
      const content = await this.readFileContent(path.join(this.dir, file));
      markdownContent += `## ${file}\n~~~\n${content.trim()}\n~~~\n`;
    }
    return markdownContent;
  }

  /**
   * Generates the markdown and writes it to `outputFilePath`.
   * @returns {Promise<{success: boolean, tokenCount?: number, error?: Error}>}
   */
  async createMarkdownDocument() {
    try {
      const markdownContent = await this.generateMarkdown();
      await fs.writeFile(this.outputFilePath, markdownContent);
      // Tokenize the document once and reuse the count for both the log
      // and the result (previously encoded twice).
      const totalTokens = llama3Tokenizer.encode(markdownContent).length;
      if (this.verbose) {
        console.log(`Markdown document created at ${this.outputFilePath}`);
        console.log({ total_tokens: totalTokens });
      }
      return { success: true, tokenCount: totalTokens };
    } catch (error) {
      if (this.verbose) console.error('Error writing markdown document:', error);
      return { success: false, error };
    }
  }

  /**
   * Runs a shell command in `this.dir` and returns trimmed stdout.
   * @param {string} command
   * @returns {string}
   * @throws Re-throws the execSync error after logging (when verbose).
   */
  execCommand(command) {
    try {
      // execSync already returns a string when an encoding is supplied;
      // the previous extra .toString() was redundant.
      return execSync(command, { cwd: this.dir, encoding: 'utf-8' }).trim();
    } catch (error) {
      if (this.verbose) console.error(`Error executing command: ${command}`, error);
      throw error;
    }
  }
}
// src/TokenCleaner.js

/**
 * Strips noise from source text (line/block comments, console calls,
 * import statements, trailing whitespace, blank lines) and redacts
 * values that look like credentials, tokens, hashes, or JWTs.
 */
export class TokenCleaner {
  /**
   * @param {Array<{regex: RegExp, replacement: string}>} [customPatterns] - Extra cleanup rules, applied after the defaults.
   * @param {Array<{regex: RegExp, replacement: string}>} [customSecretPatterns] - Extra redaction rules, applied after the defaults.
   */
  constructor(customPatterns = [], customSecretPatterns = []) {
    // Order matters: comment/console removal runs before the blank-line
    // collapsing rules, so emptied lines are cleaned up too.
    this.patterns = [
      { regex: /\/\/.*$/gm, replacement: '' },
      { regex: /\/\*[\s\S]*?\*\//gm, replacement: '' },
      { regex: /console\.(log|error|warn|info)\(.*?\);?/g, replacement: '' },
      { regex: /^\s*[\r\n]/gm, replacement: '' },
      { regex: / +$/gm, replacement: '' },
      { regex: /^\s*import\s+.*?;?\s*$/gm, replacement: '' },
      { regex: /^\s*\n+/gm, replacement: '\n' },
      ...customPatterns,
    ];

    // Lookbehind-based rules replace only the secret value, keeping the
    // surrounding key/quote syntax intact.
    this.secretPatterns = [
      { regex: /(?<=(['"])(?:api[_-]?key|api[_-]?secret|access[_-]?token|auth[_-]?token|client[_-]?secret|password|secret[_-]?key|private[_-]?key)['"]:\s*['"])[^\'"]+(?=['"])/gi, replacement: '[REDACTED]' },
      { regex: /(?<=(?:api[_-]?key|api[_-]?secret|access[_-]?token|auth[_-]?token|client[_-]?secret|password|secret[_-]?key|private[_-]?key)\s*=\s*['"])[^\'"]+(?=['"])/gi, replacement: '[REDACTED]' },
      { regex: /(?<=bearer\s+)[a-zA-Z0-9\-._~+\/]+=*/gi, replacement: '[REDACTED]' },
      { regex: /(?<=Authorization:\s*Bearer\s+)[a-zA-Z0-9\-._~+\/]+=*/gi, replacement: '[REDACTED]' },
      { regex: /(?<=eyJ)[A-Za-z0-9-_=]+\.eyJ[A-Za-z0-9-_=]+\.[A-Za-z0-9-_.+\/=]*/g, replacement: '[REDACTED_JWT]' },
      { regex: /([a-f0-9]{40}|[a-f0-9]{64})/gi, replacement: '[REDACTED_HASH]' },
      { regex: /(?<=[^A-Za-z0-9]|^)([A-Za-z0-9+\/]{40}|[A-Za-z0-9+\/]{64})(?=[^A-Za-z0-9]|$)/g, replacement: '[REDACTED_BASE64]' },
      ...customSecretPatterns,
    ];
  }

  /**
   * Applies every cleanup rule in sequence.
   * @param {string} code
   * @returns {string} Code with comments, console calls, imports, and blank lines removed.
   */
  clean(code) {
    let stripped = code;
    for (const { regex, replacement } of this.patterns) {
      stripped = stripped.replace(regex, replacement);
    }
    return stripped;
  }

  /**
   * Applies every redaction rule in sequence.
   * @param {string} code
   * @returns {string} Code with secret-looking values replaced by placeholders.
   */
  redactSecrets(code) {
    let scrubbed = code;
    for (const { regex, replacement } of this.secretPatterns) {
      scrubbed = scrubbed.replace(regex, replacement);
    }
    return scrubbed;
  }

  /**
   * Convenience pipeline: clean first, then redact the result.
   * @param {string} code
   * @returns {string}
   */
  cleanAndRedact(code) {
    return this.redactSecrets(this.clean(code));
  }
}
#!/usr/bin/env node
// src/cli.js — command-line entry point: generate the markdown document
// for the current repository and exit non-zero on failure.
import { MarkdownGenerator } from './MarkdownGenerator.js';

async function main() {
  const generator = new MarkdownGenerator();
  const result = await generator.createMarkdownDocument();
  // createMarkdownDocument resolves with { success: false } on write
  // errors rather than rejecting; map that to a failing exit code.
  if (!result.success) {
    process.exit(1);
  }
}

main().catch((error) => {
  console.error('Error:', error);
  process.exit(1);
});