jnorthrup commited on
Commit
14daa4c
·
verified ·
1 Parent(s): ad1a52e

Upload 12 files

Browse files
Files changed (12) hide show
  1. 1 +21 -0
  2. 2ocr.sh +32 -0
  3. aidocs.py +155 -0
  4. jpegdir.py +98 -0
  5. random/index.html +100 -0
  6. shove.sh +38 -0
  7. showfiles +98 -0
  8. skel.py +143 -0
  9. summ +0 -0
  10. summarize2 +415 -0
  11. tetris32b.html +275 -0
  12. vttclean.py +74 -0
1 ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ FAILURE: Build failed with an exception.
3
+
4
+ * What went wrong:
5
+ Directory '/Users/jim/work/hacks' does not contain a Gradle build.
6
+
7
+ A Gradle build's root directory should contain one of the possible settings files: settings.gradle, settings.gradle.kts, settings.gradle.dcl.It may also contain one of the possible build files: build.gradle, build.gradle.kts, build.gradle.dcl.
8
+
9
+ To create a new Gradle build in this directory run 'gradle init'
10
+
11
+ For more information about the 'init' task, please refer to https://docs.gradle.org/8.12-rc-1/userguide/build_init_plugin.html in the Gradle documentation.
12
+
13
+ For more details on creating a Gradle build, please refer to https://docs.gradle.org/8.12-rc-1/userguide/tutorial_using_tasks.html in the Gradle documentation.
14
+
15
+ * Try:
16
+ > Run gradle init to create a new Gradle build in this directory.
17
+ > Run with --stacktrace option to get the stack trace.
18
+ > Run with --info or --debug option to get more log output.
19
+ > Get more help at https://help.gradle.org.
20
+
21
+ BUILD FAILED in 413ms
2ocr.sh ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#
# OCR every TIFF page in INPUT_DIR with Tesseract and merge the resulting
# per-page PDFs into a single OUTPUT_PDF.

set -euo pipefail

# Directory containing TIFF files
INPUT_DIR="atreatiseonlawp00chitgoog_tif"
OUTPUT_PDF="output_searchable.pdf"

# Use a private scratch directory so stale PDFs from an earlier run are never
# merged in, and remove it on every exit path (including failures).
TEMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/temp_ocr.XXXXXX")
trap 'rm -rf "$TEMP_DIR"' EXIT

# With nullglob an unmatched pattern expands to nothing instead of the
# literal string "*.tif", so the "no input" case can be detected.
shopt -s nullglob
pages=("$INPUT_DIR"/*.tif)
if (( ${#pages[@]} == 0 )); then
    echo "Error: no .tif files found in '$INPUT_DIR'" >&2
    exit 1
fi

# Process each TIFF file
for file in "${pages[@]}"; do
    # Extract the filename without extension
    filename=$(basename "$file" .tif)

    # Run Tesseract on each file and output a PDF for each page
    tesseract "$file" "$TEMP_DIR/$filename" -l eng pdf
done

page_pdfs=("$TEMP_DIR"/*.pdf)
if (( ${#page_pdfs[@]} == 0 )); then
    echo "Error: Tesseract produced no PDF pages" >&2
    exit 1
fi

# Combine all individual page PDFs into a single PDF
if command -v pdfunite >/dev/null 2>&1; then
    # If pdfunite is available (from poppler-utils), use it
    pdfunite "${page_pdfs[@]}" "$OUTPUT_PDF"
    echo "Searchable PDF created as $OUTPUT_PDF"
else
    # Fallback to ImageMagick's `convert` if `pdfunite` isn't available.
    # convert re-renders each page as an image, so the OCR text layer is
    # lost; say so instead of claiming the result is searchable.
    echo "Warning: pdfunite not found; falling back to ImageMagick convert." >&2
    echo "Warning: the merged PDF will NOT contain a searchable text layer." >&2
    convert "${page_pdfs[@]}" "$OUTPUT_PDF"
    echo "PDF created as $OUTPUT_PDF (not searchable)"
fi
aidocs.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field
2
+ from typing import Optional, List, Dict, Set, Literal, Tuple, NamedTuple, Union
3
+ from pathlib import Path
4
+ import re
5
+ import fnmatch
6
+ import glob
7
+ from itertools import chain
8
+
9
@dataclass
class PathPattern:
    """Represents either a direct mapping or a wildcard pattern.

    A spec of the form ``source:target`` is a direct mapping; anything
    without a colon is treated as a glob pattern.
    """
    # Source path (direct mapping) or glob expression (wildcard case).
    pattern: str
    # Target path for a direct mapping; None means "wildcard pattern".
    target_template: Optional[str] = None

    @classmethod
    def parse(cls, spec: str) -> 'PathPattern':
        """Parse path specification into pattern and optional target.

        Only the first colon splits, so the target may itself contain colons.
        """
        if ':' in spec:
            source, target = spec.split(':', 1)
            return cls(source, target)
        return cls(spec)

    def resolve(self, root_dir: Path) -> List['PathMapping']:
        """Resolve pattern into concrete path mappings.

        Args:
            root_dir: Directory that matched files are made relative to when
                it is one of their ancestors.

        Returns:
            One mapping for a direct ``source:target`` spec, otherwise one
            mapping per regular file matched by the glob, in sorted order.
        """
        # The annotation is quoted because PathMapping is not defined before
        # this class; an unquoted name would raise NameError at class creation.
        if self.target_template is not None:
            # Direct mapping case
            return [PathMapping(Path(self.pattern), Path(self.target_template))]

        # Wildcard pattern case. glob returns matches in arbitrary order, so
        # sort them to make the result reproducible between runs.
        matches = []
        for path in sorted(glob.glob(self.pattern, recursive=True)):
            source = Path(path)
            if source.is_file():
                # For files, maintain relative structure
                relative = source.relative_to(root_dir) if root_dir in source.parents else source
                matches.append(PathMapping(source, relative))
        return matches

    def validate(self) -> None:
        """Validate pattern constraints.

        Raises:
            ValueError: If the target contains a ``..`` path component or a
                backslash, or the pattern contains one of ``< > | "``.
        """
        if self.target_template:
            # Check for path traversal in target. Compare whole components so
            # that a name such as "notes..v2" is not rejected by accident.
            if '..' in self.target_template.split('/'):
                raise ValueError(f"Target path '{self.target_template}' cannot contain '..'")

            # Only forward slashes are accepted as separators
            if '\\' in self.target_template:
                raise ValueError("Target path must use forward slashes")

        # Validate wildcard pattern
        if any(c in self.pattern for c in '<>|"'):
            raise ValueError(f"Invalid characters in pattern: {self.pattern}")
53
+
54
class WikiTransformer:
    """Resolve path patterns and process each resulting mapping.

    NOTE(review): ``_setup_logging`` and ``process_mapping`` are called below
    but are not defined in the part of the class visible here, and
    ``SizeValidator``, ``Console`` and ``Progress`` are not imported in this
    file -- confirm where they are supposed to come from.
    """

    def __init__(self, size_limit: 'SizeSpec', output_dir: Path,
                 merge_strategy: 'MergeStrategy',
                 debug: bool = False):
        # 'MergeStrategy' is quoted like 'SizeSpec': neither name exists when
        # this signature is evaluated, so a bare name would raise NameError.
        self.validator = SizeValidator(size_limit)
        self.output_dir = output_dir
        self.merge_strategy = merge_strategy
        self.debug = debug
        self.console = Console()
        self.log = self._setup_logging()
        self.processed_inodes: Set[int] = set()
        self.root_dir = Path.cwd()

    async def resolve_patterns(self, patterns: List[str]) -> List['PathMapping']:
        """Resolve all patterns into concrete mappings.

        Invalid specs are logged and skipped; specs that match nothing are
        logged as warnings.
        """
        mappings = []
        for spec in patterns:
            try:
                pattern = PathPattern.parse(spec)
                pattern.validate()
                resolved = pattern.resolve(self.root_dir)
                if not resolved:
                    self.log.warning(f"Pattern '{spec}' matched no files")
                mappings.extend(resolved)
            except ValueError as e:
                self.log.error(f"Invalid pattern '{spec}': {e}")
                continue
        return mappings

    async def transform(self, patterns: List[str]):
        """Transform source trees based on patterns and mappings.

        Raises:
            ValueError: If no pattern produced a mapping, or the merge
                strategy rejects the output directory.
        """
        mappings = await self.resolve_patterns(patterns)

        if not mappings:
            raise ValueError("No valid paths matched the specified patterns")

        if not self.merge_strategy.validate_target(self.output_dir):
            raise ValueError(
                f"Target filesystem doesn't support {self.merge_strategy.link_type} links"
            )

        self.output_dir.mkdir(parents=True, exist_ok=True)

        with Progress() as progress:
            task = progress.add_task(
                "[green]Processing files...",
                total=len(mappings)
            )

            for mapping in mappings:
                try:
                    await self.process_mapping(mapping)
                except Exception as e:
                    # Best effort: one failing mapping must not stop the rest.
                    self.log.error(f"Failed to process {mapping}: {e}")
                finally:
                    # Advance for failures too, otherwise the bar can never
                    # reach its total when any mapping fails.
                    progress.update(task, advance=1)
109
+
110
# click arguments do not accept ``help=`` (passing it raises TypeError when
# the decorator runs); PATTERNS is documented in the docstring below instead.
@click.command()
@click.argument('patterns', nargs=-1, required=True)
@click.option('-l', '--limit', type=SIZE, default='1M',
              help='Per-document size limit (e.g., 500K, 2M, 1G)')
@click.option('-d', '--debug', is_flag=True, help='Enable debug logging')
@click.option('-o', '--output-dir', type=click.Path(), default='wiki',
              help='Output directory')
@click.option('--link-type', type=click.Choice(['symlink', 'hardlink', 'copy']),
              default='symlink', help='File linking strategy')
@click.option('--follow-links/--no-follow-links', default=False,
              help='Follow symbolic links during traversal')
def main(patterns: Tuple[str, ...], limit: 'SizeSpec', debug: bool,
         output_dir: str, link_type: str, follow_links: bool):
    """Transform files into wiki structure using patterns or mappings.

    PATTERNS can be either:
    1. Colon-separated mappings: 'source:target'
    2. Wildcard patterns: '**/*.md', 'docs/**/*.rst'

    Examples:
    # Explicit mapping
    wiki_transform.py src/api:docs/api docs/intro:guide/start

    # Wildcard patterns
    wiki_transform.py '**/*.md' 'docs/**/*.rst'

    # Mixed usage
    wiki_transform.py src:api '**/*.md' 'legacy:archive'
    """
    # Imported here because the module's import block does not bring asyncio
    # into scope.
    import asyncio

    # 'copy' is represented as "no link type" for the merge strategy.
    strategy = MergeStrategy(
        link_type=None if link_type == 'copy' else link_type,
        follow_links=follow_links
    )

    transformer = WikiTransformer(
        size_limit=limit,
        output_dir=Path(output_dir),
        merge_strategy=strategy,
        debug=debug
    )

    # click delivers a variadic argument as a tuple; transform() is annotated
    # to take a list.
    asyncio.run(transformer.transform(list(patterns)))
153
+
154
+ if __name__ == '__main__':
155
+ main()
jpegdir.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from PIL import Image
3
+ import pytesseract
4
+ from pathlib import Path
5
+ import json
6
+ from typing import Dict, List
7
+ from concurrent.futures import ProcessPoolExecutor
8
+ import multiprocessing
9
+
10
def process_image(args) -> tuple:
    """
    OCR one image and store the recognised text as ``<stem>.txt``.

    Args:
        args: Tuple of (filename, input_dir, output_dir); packed into one
            value so the function can be used with ``executor.map``.
    Returns:
        Tuple of (filename, extracted_text). When anything fails the second
        element is the string ``"ERROR: <message>"`` instead of the text.
    """
    filename, input_dir, output_dir = args
    try:
        source_path = os.path.join(input_dir, filename)

        # Keep the image open only for the duration of the OCR call.
        with Image.open(source_path) as img:
            text = pytesseract.image_to_string(img)

        # One text file per image, named after the image's stem.
        target_path = os.path.join(output_dir, Path(filename).stem + '.txt')
        with open(target_path, 'w', encoding='utf-8') as out:
            out.write(text)

        print(f"Processed: {filename}")
        return filename, text

    except Exception as exc:
        # Report the failure in-band so one bad image does not abort the batch.
        message = str(exc)
        print(f"Error processing {filename}: {message}")
        return filename, f"ERROR: {message}"
41
+
42
def process_directory(input_dir: str, output_dir: str, max_workers: int = None) -> Dict[str, str]:
    """
    Process all JPEG files in a directory and perform OCR using multiple processes.

    Args:
        input_dir: Directory containing JPEG files
        output_dir: Directory to save OCR results
        max_workers: Maximum number of worker processes (defaults to CPU count)

    Returns:
        Dictionary mapping filenames to extracted text
    """
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # If max_workers not specified, use CPU count
    if max_workers is None:
        max_workers = multiprocessing.cpu_count()

    # Supported image extensions, compared case-insensitively so that
    # mixed-case names such as "scan.Jpg" are not silently skipped.
    valid_extensions = {'.jpg', '.jpeg'}

    # Get list of valid image files. os.listdir returns entries in arbitrary
    # order; sorting keeps the JSON output stable between runs.
    image_files = sorted(
        f for f in os.listdir(input_dir)
        if Path(f).suffix.lower() in valid_extensions
    )

    # Prepare arguments for worker processes
    work_args = [(f, input_dir, output_dir) for f in image_files]

    # Process files concurrently; executor.map yields results in input order.
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        results = dict(executor.map(process_image, work_args))

    # Save consolidated results to JSON
    json_path = os.path.join(output_dir, 'ocr_results.json')
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    return results
85
+
86
+ if __name__ == "__main__":
87
+ import argparse
88
+
89
+ parser = argparse.ArgumentParser(description='Perform OCR on all JPEG files in a directory')
90
+ parser.add_argument('input_dir', help='Input directory containing JPEG files')
91
+ parser.add_argument('output_dir', help='Output directory for OCR results')
92
+ parser.add_argument('--workers', type=int, help='Number of worker processes (default: CPU count)',
93
+ default=None)
94
+
95
+ args = parser.parse_args()
96
+
97
+ results = process_directory(args.input_dir, args.output_dir, args.workers)
98
+ print(f"\nProcessed {len(results)} files. Results saved to {args.output_dir}")
random/index.html ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Force-Directed Voronoi Diagram</title>
7
+ <script src="https://d3js.org/d3.v7.min.js"></script>
8
+ <style>
9
+ body {
10
+ margin: 0;
11
+ overflow: hidden;
12
+ }
13
+ svg {
14
+ display: block;
15
+ }
16
+ </style>
17
+ </head>
18
+ <body>
19
+ <script>
20
+ // Dimensions
21
+ const width = window.innerWidth;
22
+ const height = window.innerHeight;
23
+
24
+ // Create SVG container
25
+ const svg = d3.select("body")
26
+ .append("svg")
27
+ .attr("width", width)
28
+ .attr("height", height);
29
+
30
+ // Random initial dataset
31
+ let data = d3.range(20).map(() => ({
32
+ x: Math.random() * width,
33
+ y: Math.random() * height,
34
+ value: Math.random()
35
+ }));
36
+
37
+ // Force simulation
38
+ const simulation = d3.forceSimulation(data)
39
+ .force("x", d3.forceX(d => d.x).strength(0.5))
40
+ .force("y", d3.forceY(d => d.y).strength(0.5))
41
+ .force("collide", d3.forceCollide(50))
42
+ .on("tick", update);
43
+
44
// Voronoi diagram: d3.voronoi() was removed in d3 v6, and this page loads
// d3 v7, so the diagram is built per tick from d3.Delaunay inside update().

// Group for Voronoi cells
const voronoiGroup = svg.append("g");

// Group for circles
const circleGroup = svg.append("g");

// Redraw the Voronoi cells and the site markers from the current node
// positions. Registered as the simulation's "tick" handler.
function update() {
    // Bounds are [xmin, ymin, xmax, ymax] for the clipped Voronoi diagram.
    const diagram = d3.Delaunay
        .from(data, d => d.x, d => d.y)
        .voronoi([0, 0, width, height]);

    // Update Voronoi cells
    const cells = voronoiGroup.selectAll("path")
        .data(data);

    cells.enter()
        .append("path")
        .merge(cells)
        // renderCell(i) returns the SVG path string of the i-th cell.
        .attr("d", (d, i) => diagram.renderCell(i))
        .attr("fill", d => d3.interpolateRainbow(d.value))
        .attr("stroke", "#000");

    cells.exit().remove();

    // Update circles
    const circles = circleGroup.selectAll("circle")
        .data(data);

    circles.enter()
        .append("circle")
        .merge(circles)
        .attr("r", 5)
        .attr("fill", "black")
        .attr("cx", d => d.x)
        .attr("cy", d => d.y);

    circles.exit().remove();
}
86
+
87
+ // Add a new data point every 2 seconds
88
+ setInterval(() => {
89
+ data.push({
90
+ x: Math.random() * width,
91
+ y: Math.random() * height,
92
+ value: Math.random()
93
+ });
94
+
95
+ simulation.nodes(data);
96
+ simulation.alpha(1).restart();
97
+ }, 2000);
98
+ </script>
99
+ </body>
100
+ </html>
shove.sh ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#
# Stack output-*.png pages vertically in batches of BATCH_SIZE and run
# easyocr on each stacked image, appending the text to RESULTS_FILE.

set -euo pipefail

# An unmatched glob must expand to nothing, not to the literal pattern.
shopt -s nullglob

# Variables
BATCH_SIZE=30
OUTPUT_DIR="batches"
RESULTS_FILE="ocr_results.txt"

# Create output directory if not exists
mkdir -p "$OUTPUT_DIR"

# Clear results file
: > "$RESULTS_FILE"

# Collect the inputs once instead of re-running `ls | wc -l` per file.
# NOTE: glob order is lexical, so output-10.png sorts before output-2.png
# unless the page numbers are zero-padded.
FILES=(output-*.png)
TOTAL=${#FILES[@]}
if (( TOTAL == 0 )); then
    echo "No output-*.png files found." >&2
    exit 1
fi

# Walk the list in slices of BATCH_SIZE. A running index names the batches,
# so the final partial batch gets its own file instead of overwriting the
# previous one (COUNTER / BATCH_SIZE rounds down to the previous number).
BATCH_INDEX=0
for (( START = 0; START < TOTAL; START += BATCH_SIZE )); do
    BATCH_FILES=("${FILES[@]:START:BATCH_SIZE}")
    BATCH_INDEX=$((BATCH_INDEX + 1))
    BATCH_NAME="${OUTPUT_DIR}/batch_${BATCH_INDEX}.png"

    if (( ${#BATCH_FILES[@]} == 1 )); then
        # vstack needs at least two inputs; a lone page is used as-is.
        cp -- "${BATCH_FILES[0]}" "$BATCH_NAME"
    else
        # vstack takes one ffmpeg input per image (all of the same width);
        # the concat: protocol joins byte streams and cannot stack PNGs.
        INPUT_ARGS=()
        for FILE in "${BATCH_FILES[@]}"; do
            INPUT_ARGS+=(-i "$FILE")
        done
        ffmpeg -nostdin -y "${INPUT_ARGS[@]}" \
            -filter_complex "vstack=inputs=${#BATCH_FILES[@]}" \
            -frames:v 1 "$BATCH_NAME"
    fi

    # Run easyocr on the concatenated image
    echo "Processing $BATCH_NAME..."
    easyocr -l en -f "$BATCH_NAME" --gpu True >> "$RESULTS_FILE"
done

echo "OCR processing complete. Results saved to $RESULTS_FILE."
showfiles ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# Mission Statement:
# This script displays the contents of specified files with formatted headers.
# It provides optional file size limits through the -k flag (specified in KB).
# Without the -k flag, files are shown in their entirety.
# With -k flag, files larger than the specified size are truncated with a warning.
# The script handles both Linux and MacOS systems using compatible stat commands.
# Color output is available via the -c flag for better visual organization.

# ANSI color codes
BLUE='\033[0;34m'
GREEN='\033[0;32m'
RED='\033[0;31m'
NC='\033[0m' # No Color

# Print usage and exit with status 1.
usage() {
    echo "Usage: $(basename "$0") [-k size_in_kb] [-c] files..."
    echo "  -k: Maximum file size in KB (optional)"
    echo "  -c: Enable color output"
    exit 1
}

# Initialize variables
COLOR=false
MAX_SIZE_KB=""

# Parse command line options
while getopts "k:c" opt; do
    case $opt in
        k) MAX_SIZE_KB="$OPTARG";;
        c) COLOR=true;;
        *) usage;;
    esac
done

# Shift past the options
shift $((OPTIND-1))

# The limit is used in arithmetic below, so reject anything that is not a
# plain non-negative integer instead of failing later with a shell error.
if [ -n "$MAX_SIZE_KB" ]; then
    if ! [[ "$MAX_SIZE_KB" =~ ^[0-9]+$ ]]; then
        echo "Error: -k expects a non-negative integer number of KB" >&2
        usage
    fi
    # Force base 10 so a value like "08" is not parsed as invalid octal.
    MAX_SIZE_KB=$((10#$MAX_SIZE_KB))
fi

# Check if any files were specified
if [ $# -eq 0 ]; then
    usage
fi

# Print the size of file $1 in bytes (compatible with both Linux and MacOS)
get_file_size() {
    if [[ "$OSTYPE" == "darwin"* ]]; then
        stat -f %z "$1"
    else
        stat --format=%s "$1"
    fi
}

# Format and display file header; the size shown is rounded down to whole KB.
show_header() {
    local file="$1"
    local size_bytes
    size_bytes=$(get_file_size "$file")
    local size_kb=$((size_bytes / 1024))

    if $COLOR; then
        echo -e "\n${BLUE}=== File: ${GREEN}$file${BLUE} (${size_kb}KB) ===${NC}"
    else
        echo -e "\n=== File: $file (${size_kb}KB) ==="
    fi
}

# Process each file
for file in "$@"; do
    if [ ! -f "$file" ]; then
        if $COLOR; then
            echo -e "${RED}Error: '$file' does not exist or is not a regular file${NC}" >&2
        else
            echo "Error: '$file' does not exist or is not a regular file" >&2
        fi
        continue
    fi

    show_header "$file"

    # No limit requested: show the whole file.
    if [ -z "$MAX_SIZE_KB" ]; then
        cat -- "$file"
        continue
    fi

    size_bytes=$(get_file_size "$file")
    max_bytes=$((MAX_SIZE_KB * 1024))

    # Compare in bytes: comparing truncated KB values would let a file of up
    # to 1023 bytes over the limit through untruncated.
    if [ "$size_bytes" -gt "$max_bytes" ]; then
        size_kb=$((size_bytes / 1024))
        if $COLOR; then
            echo -e "${RED}File size ($size_kb KB) exceeds limit ($MAX_SIZE_KB KB). Showing first $MAX_SIZE_KB KB:${NC}"
        else
            echo "File size ($size_kb KB) exceeds limit ($MAX_SIZE_KB KB). Showing first $MAX_SIZE_KB KB:"
        fi
        head -c "$max_bytes" -- "$file"
        echo -e "\n[Truncated...]"
    else
        cat -- "$file"
    fi
done
skel.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!python3
2
+ import unittest
3
+ from pathlib import Path
4
+ import tempfile
5
+ import os
6
+
7
class TestSkeletonMapper(unittest.TestCase):
    """Regression tests for skeleton extraction on Kotlin sources.

    NOTE(review): ``create_language_patterns`` and ``extract_skeleton`` are
    not defined or imported in this file -- confirm where they come from.
    """

    def setUp(self):
        # Register the cleanup before anything else can fail so the scratch
        # directory is removed after every test instead of being left behind.
        self._temp_dir_handle = tempfile.TemporaryDirectory()
        self.addCleanup(self._temp_dir_handle.cleanup)
        self.temp_dir = self._temp_dir_handle.name
        self.patterns = create_language_patterns()

    def create_test_file(self, content: str, extension: str) -> str:
        """Write *content* to ``test<extension>`` in the scratch directory.

        Returns:
            The path of the written file as a string.
        """
        path = Path(self.temp_dir) / f"test{extension}"
        # Explicit encoding keeps the fixture independent of the locale.
        path.write_text(content, encoding='utf-8')
        return str(path)

    def test_kotlin_edge_cases(self):
        kotlin_code = '''
        @DslMarker
        annotation class NioProxyDsl

        interface EnhancedNioProxy<T : Any> {
            val original: T
            fun verifyIdentity(): Boolean = enhanced.equals(original)
        }

        class ProxyContext {
            private val _events = MutableSharedFlow<ProxyEvent>()
        }
        '''
        file_path = self.create_test_file(kotlin_code, ".kt")
        results = extract_skeleton(file_path, self.patterns)

        # BUG 1: Missing generic type parameters in class/interface detection
        self.assertIn("interface EnhancedNioProxy<T : Any>", results['interface'])

        # BUG 2: Property detection fails with initialization
        self.assertIn("val original: T", results['property'])

        # BUG 3: Annotation detection drops parameters
        self.assertIn("@DslMarker", results['annotation'])
42
+
43
def fix_kotlin_patterns():
    """Return regular expressions that recognise Kotlin declarations.

    Each pattern is anchored at the start of a line (after optional
    whitespace). Except for ``'suspend'``, group 1 captures the declared
    simple name.

    Returns:
        Dict with the keys ``class``, ``function``, ``property``,
        ``interface``, ``annotation`` and ``suspend``.
    """
    # Zero or more leading modifiers. Without this, declarations such as
    # "private val x", "override fun f" or "annotation class A" never match
    # because the keyword is not the first token on the line. "suspend" is
    # left out on purpose so that suspend functions stay in their own bucket.
    modifiers = (
        r'(?:(?:public|private|protected|internal|open|abstract|final|sealed'
        r'|data|inner|enum|annotation|value|inline|override|lateinit|const'
        r'|operator|infix|tailrec|external|expect|actual)\s+)*'
    )
    # Optional type-parameter list directly after a declared name.
    generics = r'(?:<[^>]+>)?'

    return {
        'class': rf'^\s*{modifiers}class\s+(\w+){generics}',
        # Kotlin puts a function's type parameters before its name
        # ("fun <T> foo()"), optionally followed by an extension receiver
        # ("fun String.shout()"); skip both so group 1 is the function name.
        'function': (
            rf'^\s*{modifiers}fun\s+(?:<[^>]+>\s*)?'
            rf'(?:\w+(?:<[^>]*>)?\??\.)*(\w+){generics}'
        ),
        'property': rf'^\s*{modifiers}(?:var|val)\s+(\w+)(?:\s*:\s*[^=]+)?(?:\s*=.+)?',
        # "fun interface" declares a functional (SAM) interface.
        'interface': rf'^\s*{modifiers}(?:fun\s+)?interface\s+(\w+){generics}',
        # Optional use-site target ("@file:") and package qualifier, then the
        # simple name, then one parenthesised argument list. Matching the
        # arguments with [^)]* keeps quoted parameters in the match without
        # swallowing the declaration that follows the annotation.
        'annotation': r'^\s*@(?:\w+:)?(?:\w+\.)*(\w+)(?:\s*\([^)]*\))?',
        'suspend': rf'^\s*{modifiers}suspend\s+{modifiers}fun\s+\w+',
    }
52
+
53
+ # Critical fixes for main implementation
54
def patch_implementation():
    """
    Critical patches for identified issues

    Returns:
        Tuple ``(safe_grep, escape_grep_pattern, read_file_safe)`` of helper
        functions.
    """
    # Imported locally: the module's top-level imports do not include
    # subprocess, so safe_grep would otherwise fail with NameError.
    import subprocess

    # 1. Fix subprocess handling for large files
    def safe_grep(cmd: str, timeout: int = 30) -> str:
        """Run *cmd* through the shell and return its stdout.

        Returns an empty string when the command exceeds *timeout* seconds.
        """
        # NOTE(security): shell=True executes cmd verbatim. Callers must not
        # build it from untrusted text without quoting (e.g. shlex.quote).
        try:
            completed = subprocess.run(
                cmd,
                shell=True,
                text=True,
                capture_output=True,
                timeout=timeout
            )
        except subprocess.TimeoutExpired:
            return ""
        return completed.stdout

    # 2. Fix pattern escaping in grep command
    def escape_grep_pattern(pattern: str) -> str:
        """Backslash-escape every parenthesis in *pattern*."""
        return pattern.replace('(', '\\(').replace(')', '\\)')

    # 3. Add file encoding handling
    def read_file_safe(file_path: str) -> str:
        """Read *file_path* as UTF-8, falling back to Latin-1.

        Latin-1 maps every byte to a character, so the fallback cannot fail
        with a decode error. I/O errors (missing file, permissions) propagate
        from either attempt instead of being hidden only on the second one.
        """
        for encoding in ('utf-8', 'latin-1'):
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    return f.read()
            except UnicodeDecodeError:
                continue
        return ""

    return safe_grep, escape_grep_pattern, read_file_safe
88
+
89
+ # Shell script fixes
90
def generate_fixed_shell_script():
    """Return the text of a bash script that greps Kotlin declarations.

    The generated script walks every regular file below the current
    directory and, for ``.kt``/``.kts`` files, prints lines that start with an
    annotation, class, interface or function declaration.
    """
    # The literal starts with the shebang: a leading blank line would stop
    # the kernel from recognising it when the text is saved and executed.
    # A raw string keeps the grep escapes (\w, \s) readable.
    return r'''#!/bin/bash

# Prevent grep pattern injection
# Defined once, outside the loop, instead of on every iteration.
safe_grep() {
    local pattern=$1
    local file=$2
    grep -E "^[[:space:]]*${pattern}" "$file" 2>/dev/null || true
}

# Fixed file handling
while IFS= read -r -d '' file; do
    if [[ ! -f "$file" ]]; then
        continue
    fi

    # Handle filenames with spaces and special chars: the NUL-delimited read
    # plus quoting "$file" is sufficient. Escaping the name with printf '%q'
    # and then quoting it again would make grep look for a file that does
    # not exist.
    file_ext="${file##*.}"

    case "$file_ext" in
        kt|kts)
            safe_grep "(@\w+|class\s+\w+|interface\s+\w+|fun\s+\w+)" "$file"
            ;;
        # ... other extensions
    esac
done < <(find . -type f -print0)
'''
119
+
120
+ # Runtime monitoring hooks
121
def add_monitoring():
    """Build a decorator that warns about slow or memory-hungry calls.

    Returns:
        ``monitor_execution``, a decorator. The wrapped function prints a
        warning when a call takes longer than 5 seconds or grows the
        process's resident set size by more than 100 MB.
    """
    import functools
    import time

    # psutil is third-party. Without it the decorator still works and only
    # the timing check is active, rather than failing at decoration time.
    try:
        import psutil
    except ImportError:
        psutil = None

    def monitor_execution(func):
        # wraps keeps __name__/__doc__ of the decorated function intact.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            process = psutil.Process() if psutil is not None else None
            mem_before = process.memory_info().rss if process is not None else 0
            # perf_counter is monotonic, so clock adjustments cannot produce
            # a negative or inflated duration.
            start = time.perf_counter()

            result = func(*args, **kwargs)

            elapsed = time.perf_counter() - start
            mem_after = process.memory_info().rss if process is not None else 0
            mem_delta = mem_after - mem_before

            if elapsed > 5.0 or mem_delta > 100*1024*1024:  # 100MB
                print(f"Warning: High resource usage in {func.__name__}")

            return result
        return wrapper

    return monitor_execution
summ ADDED
File without changes
summarize2 ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+
3
+ # CHARTER: This script is designed to process audio/video content, generate transcripts,
4
+ # summaries, and audio files. The following tasks are defined and must not be altered by any LLM:
5
+ # 1. Download content if a URL is provided
6
+ # 2. Generate or use existing VTT transcript
7
+ # 3. Clean the VTT transcript
8
+ # 4. Generate a summary of the transcript
9
+ # 5. Create a 16k VBR Opus OGG file for audio tracks (unless audio download is disabled)
10
+ # 6. Output both the cleaned VTT text and the summary
11
+ # 7. Exclude the WAV file from the results
12
+ # 8. Include the OGG file in the results only if both WAV and OGG were created
13
+ # This charter is unalterable and defines the core functionality of the script.
14
+
15
+ # Configuration (adjust these paths)
16
+ WHISPCC="$HOME/work/whisper.cpp" # ./main to run ; ./models for models
17
+ MODEL_PATH="$WHISPCC/models/ggml-small.en-tdrz.bin"
18
+ OUTPUT_DIR="$HOME/processed_audio"
19
+ CACHE_DIR="/tmp/summarize_cache"
20
+ OLLAMA_MODEL="llama3.1:latest"
21
+ OLLAMA_MODEL="deepseek-coder-v2:16b"
22
+
23
+ # Prompts for different segments
24
+ FIRST_PROMPT="Summarize this beginning part of a transcript in one sentence, then provide bullet points with timestamps (00:00:00 sentence)."
25
+ MIDDLE_PROMPT="Summarize the key points of this part of the transcript in bullet points with timestamps (00:00:00 sentence)."
26
+ LAST_PROMPT="Summarize the main takeaways of this final part of the transcript in bullet points with timestamps (00:00:00 sentence)."
27
+
28
+ # Global variable to track job queue
29
+ JOB_QUEUE=()
30
+
31
+ # Ensure output and cache directories exist
32
+ mkdir -p "$OUTPUT_DIR" "$CACHE_DIR"
33
+
34
# Parse command line options
#   -f        set USE_FABRIC=true (summarize with fabric instead of ollama)
#   -n        set DISABLE_AUDIO=true
#   -a        set DISABLE_AUDIO=false (the default)
#   -d VALUE  store VALUE in DURATION
USE_FABRIC=false
DISABLE_AUDIO=false
DURATION=""
# The leading ':' selects getopts' silent mode. Only in that mode is OPTARG
# set to the offending letter for '?' and ':', which the messages below use.
while getopts ":fnad:" opt; do
    case $opt in
        f)
            USE_FABRIC=true
            ;;
        n)
            DISABLE_AUDIO=true
            ;;
        a)
            DISABLE_AUDIO=false
            ;;
        d)
            DURATION="$OPTARG"
            ;;
        :)
            echo "Option -$OPTARG requires an argument." >&2
            exit 1
            ;;
        \?)
            echo "Invalid option: -$OPTARG" >&2
            exit 1
            ;;
    esac
done
shift $((OPTIND-1))
59
+
60
+ # Function to get MD5 hash of a file
61
# Print the MD5 hex digest of the file given as $1.
# md5sum (GNU coreutils) is not present on a stock macOS install, so fall
# back to the BSD `md5 -q`, which prints only the digest.
get_md5() {
    if command -v md5sum >/dev/null 2>&1; then
        md5sum "$1" | cut -d' ' -f1
    else
        md5 -q "$1"
    fi
}
64
+
65
+ # Function to cache a file using hardlinks (atomic)
66
# Store INPUT_FILE ($1) in the cache under a name derived from its MD5 and
# sanitized path, with EXTENSION ($2) appended.
# Prints the cache path on stdout; diagnostics go to stderr.
# Returns 0 on success, 1 on any failure.
cache_file() {
    local INPUT_FILE="$1"
    local EXTENSION="$2"

    # Check if the input file exists and is not empty
    if [ ! -s "$INPUT_FILE" ]; then
        echo "Error: Input file is empty or does not exist." >&2
        return 1
    fi

    # Declare and assign separately: `local X=$(cmd)` always returns 0 and
    # would hide a failing hash command.
    local MD5
    if ! MD5=$(get_md5 "$INPUT_FILE") || [ -z "$MD5" ]; then
        echo "Error: Failed to compute MD5 of input file." >&2
        return 1
    fi

    local CACHE_SUBDIR="$CACHE_DIR/${MD5:0:2}/${MD5:2:2}"
    local SAFE_FILENAME
    SAFE_FILENAME=$(echo "$INPUT_FILE" | sed 's/[^a-zA-Z0-9._-]/_/g')
    local CACHE_FILE="$CACHE_SUBDIR/${MD5}_${SAFE_FILENAME}${EXTENSION}"

    echo "Cache operation: MD5 sum = $MD5" >&2
    echo "Cache file: $CACHE_FILE" >&2

    # Create cache subdirectory if it doesn't exist
    if ! mkdir -p "$CACHE_SUBDIR"; then
        echo "Error: Failed to create cache subdirectory." >&2
        return 1
    fi

    # Attempt to create the hardlink. Hardlinks cannot cross filesystems
    # (CACHE_DIR lives under /tmp), so fall back to copying into a temporary
    # name and renaming it, which keeps the final step atomic.
    local TMP_COPY="${CACHE_FILE}.tmp.$$"
    if ln -f "$INPUT_FILE" "$CACHE_FILE" 2>/dev/null ||
        { cp -f "$INPUT_FILE" "$TMP_COPY" && mv -f "$TMP_COPY" "$CACHE_FILE"; }; then
        echo "Cache file created: $CACHE_FILE" >&2
        echo "$CACHE_FILE"
        return 0
    else
        rm -f "$TMP_COPY"
        echo "Error: Failed to create cache file." >&2
        return 1
    fi
}
100
+
101
+ # Function to sanitize a string for use as a filename
102
+ sanitize_filename() {
103
+ local STRING="$1"
104
+ echo "$STRING" | iconv -c -t ascii//translit | sed 's/[^A-Za-z0-9._-]/_/g' | tr '[:upper:]' '[:lower:]'
105
+ }
106
+
107
+ # Function to clean text from a VTT file
108
+ clean_text() {
109
+ sed 's/<[^>]*>//g' | tr -s ' ' | sed 's/^[ \t]*//;s/[ \t]*$//'
110
+ }
111
+
112
+ # Function to summarize a segment of text
113
+ summarize_segment() {
114
+ local SEGMENT_TEXT="$1"
115
+ local PROMPT="$2"
116
+ local SUMMARY_OUTPUT=""
117
+
118
+ # Count the number of lines in the input
119
+ local LINE_COUNT=$(echo "$SEGMENT_TEXT" | wc -l)
120
+
121
+ # If the input has less than 12 lines, remove cache and return a simple response
122
+ if [ "$LINE_COUNT" -lt 12 ]; then
123
+ local MD5=$(echo "$SEGMENT_TEXT" | md5sum | cut -d' ' -f1)
124
+ local CACHE_SUBDIR="$CACHE_DIR/${MD5:0:2}/${MD5:2:2}"
125
+ rm -f "$CACHE_SUBDIR/$MD5"*
126
+ echo "The input is too short for meaningful summarization. Cache entry removed. Here's the original text:"
127
+ echo "$SEGMENT_TEXT"
128
+ return 0
129
+ fi
130
+
131
+ if $USE_FABRIC; then
132
+ SUMMARY_OUTPUT=$(fabric -p summarize "$SEGMENT_TEXT" 2>&1)
133
+ else
134
+ # Use ollama for summarization
135
+ SUMMARY_OUTPUT=$(ollama run "$OLLAMA_MODEL" "$PROMPT" "$SEGMENT_TEXT" 2>&1)
136
+ fi
137
+
138
+ if [ $? -ne 0 ]; then
139
+ echo "Error in summarization: $SUMMARY_OUTPUT" >&2
140
+ return 1
141
+ fi
142
+
143
+ echo "$SUMMARY_OUTPUT"
144
+ }
145
+
146
+ # Function to add a job to the queue
147
+ add_job() {
148
+ JOB_QUEUE+=("$@")
149
+ }
150
+
151
+ # Function to update the progress bar for a job
152
+ update_job_progress() {
153
+ local JOB_INDEX="$1"
154
+ local TOTAL_STEPS="$2"
155
+ local CURRENT_STEP="$3"
156
+ local JOB_MESSAGE="$4"
157
+
158
+ # ... (Implementation for updating the TUI progress bar)
159
+ # You can use a library like 'whiptail' or 'dialog' for TUI elements
160
+ # Example using echo for now:
161
+ echo "Job $((JOB_INDEX+1))/$JOB_COUNT: $JOB_MESSAGE ($CURRENT_STEP/$TOTAL_STEPS)"
162
+ }
163
+
164
+ # Function to process the job queue
165
+ process_job_queue() {
166
+ local JOB_COUNT=${#JOB_QUEUE[@]}
167
+ echo "Processing job queue ($JOB_COUNT jobs)..."
168
+ for (( i=0; i<JOB_COUNT; i++ )); do
169
+ # Remove update_job_progress calls
170
+ eval "${JOB_QUEUE[$i]}"
171
+ done
172
+ }
173
+
174
+ # Function to process a single segment
175
+ process_segment() {
176
+ local SEGMENT_TEXT="$1"
177
+ local PROMPT="$2"
178
+ local OUTPUT_FILE="$3"
179
+ local SUMMARY_OUTPUT=""
180
+
181
+ # Count the number of lines in the input
182
+ local LINE_COUNT=$(echo "$SEGMENT_TEXT" | wc -l)
183
+
184
+ # If the input has less than 12 lines, remove cache and return a simple response
185
+ if [ "$LINE_COUNT" -lt 12 ]; then
186
+ local MD5=$(echo "$SEGMENT_TEXT" | md5sum | cut -d' ' -f1)
187
+ local CACHE_SUBDIR="$CACHE_DIR/${MD5:0:2}/${MD5:2:2}"
188
+ rm -f "$CACHE_SUBDIR/$MD5"*
189
+ echo "The input is too short for meaningful summarization. Cache entry removed. Here's the original text:"
190
+ echo "$SEGMENT_TEXT" > "$OUTPUT_FILE"
191
+ return 0
192
+ fi
193
+
194
+ if $USE_FABRIC; then
195
+ SUMMARY_OUTPUT=$(fabric -p summarize "$SEGMENT_TEXT" 2>&1)
196
+ else
197
+ # Use ollama for summarization
198
+ SUMMARY_OUTPUT=$(ollama run "$OLLAMA_MODEL" "$PROMPT" "$SEGMENT_TEXT" 2>&1)
199
+ fi
200
+
201
+ if [ $? -ne 0 ]; then
202
+ echo "Error in summarization: $SUMMARY_OUTPUT" >&2
203
+ return 1
204
+ fi
205
+
206
+ # Write the summary to the specified output file
207
+ echo "$SUMMARY_OUTPUT" > "$OUTPUT_FILE"
208
+ }
209
+
210
+ # Function to process a VTT file (generate summary and handle versioning)
211
# Helper for process_vtt: print a progress line and the summary of one
# transcript part. $1 = part label, $2 = ollama prompt, $3 = text.
# Empty parts are skipped so no model call is made with nothing to summarize.
_summarize_part() {
    local LABEL="$1"
    local PROMPT="$2"
    local TEXT="$3"

    [ -n "$TEXT" ] || return 0

    echo "Generating summary for $LABEL segment..."
    if $USE_FABRIC; then
        fabric -p summarize "$TEXT"
    else
        ollama run "$OLLAMA_MODEL" "$PROMPT" "$TEXT"
    fi
}

# Clean the VTT file $1 with vttclean.py, summarize it in three parts and
# write the result to "$OUTPUT_DIR/<name>_summary.txt", then print it.
# $2 (URL) is accepted but not used here.
# Exits the script with status 1 when cleaning or summarizing fails.
process_vtt() {
    local VTT_FILE=$1
    local URL=$2
    local TEMP_DIR
    if ! TEMP_DIR=$(mktemp -d); then
        echo "Error: Failed to create temporary directory." >&2
        exit 1
    fi
    local BASE_NAME="${TEMP_DIR}/temp" # Temporary base name
    local CLEANED_TRANSCRIPT="${BASE_NAME}_cleaned.txt"
    local SUMMARY_FILE="${OUTPUT_DIR}/$(basename "$VTT_FILE" .vtt)_summary.txt"

    echo "Processing VTT file: $VTT_FILE"

    # Clean the VTT transcript
    if ! python3 "$(dirname "$0")/vttclean.py" "$VTT_FILE" > "$CLEANED_TRANSCRIPT" 2>"${CLEANED_TRANSCRIPT}.error"; then
        echo "Error: Failed to clean the VTT file. Error log:" >&2
        cat "${CLEANED_TRANSCRIPT}.error" >&2
        # Remove the scratch directory on error exits too.
        rm -rf "$TEMP_DIR"
        exit 1
    fi

    # Check if the cleaned transcript is empty
    if [ ! -s "$CLEANED_TRANSCRIPT" ]; then
        echo "Error: Cleaned transcript is empty." >&2
        rm -rf "$TEMP_DIR"
        exit 1
    fi

    # Generate summary
    echo "Summarizing transcript..."
    local TOTAL_LINES
    TOTAL_LINES=$(wc -l < "$CLEANED_TRANSCRIPT")
    local SEGMENT_SIZE=$((TOTAL_LINES / 3))
    local FIRST_SEGMENT=""
    local MIDDLE_SEGMENT=""
    if (( SEGMENT_SIZE > 0 )); then
        FIRST_SEGMENT=$(head -n "$SEGMENT_SIZE" "$CLEANED_TRANSCRIPT")
        MIDDLE_SEGMENT=$(sed -n "$((SEGMENT_SIZE + 1)),$((2 * SEGMENT_SIZE))p" "$CLEANED_TRANSCRIPT")
    fi
    # The last part is everything after the middle part. Taking only the
    # final SEGMENT_SIZE lines would drop up to two lines whenever the line
    # count is not a multiple of three.
    local LAST_SEGMENT
    LAST_SEGMENT=$(tail -n "+$((2 * SEGMENT_SIZE + 1))" "$CLEANED_TRANSCRIPT")

    {
        _summarize_part "first" "$FIRST_PROMPT" "$FIRST_SEGMENT"
        _summarize_part "middle" "$MIDDLE_PROMPT" "$MIDDLE_SEGMENT"
        _summarize_part "last" "$LAST_PROMPT" "$LAST_SEGMENT"
    } > "$SUMMARY_FILE"

    if [ ! -s "$SUMMARY_FILE" ]; then
        echo "Error: Summary generation failed." >&2
        rm -rf "$TEMP_DIR"
        exit 1
    fi

    echo "Summarization complete."

    # Display the content of the summary file
    echo "Summary content:"
    echo "----------------------------------------"
    cat "$SUMMARY_FILE"
    echo "----------------------------------------"

    # Clean up
    rm -rf "$TEMP_DIR"
}
281
+
282
# Function to calculate the time difference between two timestamps in HH:MM:SS format
#
# Arguments:
#   $1 - first timestamp  (HH:MM:SS, an optional ".mmm" fraction is ignored)
#   $2 - second timestamp (same format)
# Prints: TIME1 - TIME2 in whole seconds (negative if TIME2 is later).
time_difference() {
    local TIME1="$1" # Format: HH:MM:SS
    local TIME2="$2" # Format: HH:MM:SS

    # Split each timestamp into hours, minutes and seconds
    local TIME1_HOUR TIME1_MINUTE TIME1_SECOND
    local TIME2_HOUR TIME2_MINUTE TIME2_SECOND
    IFS=: read -r TIME1_HOUR TIME1_MINUTE TIME1_SECOND <<< "$TIME1"
    IFS=: read -r TIME2_HOUR TIME2_MINUTE TIME2_SECOND <<< "$TIME2"

    # Drop fractional seconds; shell arithmetic is integer-only.
    TIME1_SECOND=${TIME1_SECOND%%.*}
    TIME2_SECOND=${TIME2_SECOND%%.*}

    # Calculate total seconds for each timestamp.
    # The 10# prefix forces base 10: without it "08" and "09" are read as
    # (invalid) octal literals and the arithmetic expansion aborts.
    local TIME1_TOTAL_SECONDS=$((10#${TIME1_HOUR:-0} * 3600 + 10#${TIME1_MINUTE:-0} * 60 + 10#${TIME1_SECOND:-0}))
    local TIME2_TOTAL_SECONDS=$((10#${TIME2_HOUR:-0} * 3600 + 10#${TIME2_MINUTE:-0} * 60 + 10#${TIME2_SECOND:-0}))

    # Calculate the difference in seconds
    local DIFF_SECONDS=$((TIME1_TOTAL_SECONDS - TIME2_TOTAL_SECONDS))

    # Return the difference (could be negative if TIME2 is later than TIME1)
    echo "$DIFF_SECONDS"
}
306
+
307
# Main script logic
#
# Dispatches on the first argument:
#   *.vtt        - summarize the given subtitle file
#   *http*       - fetch subtitles with yt-dlp, or fall back to downloading
#                  the audio and transcribing it with Whisper-CPP
#   regular file - treat it as local audio and transcribe it
# Work is queued with add_job and executed by process_job_queue at the end.
# Queued commands are later run through eval, so every argument is
# shell-quoted with printf %q to survive quotes, spaces, "$" and backticks.
if [ $# -eq 0 ]; then
    echo "Error: No input provided. Please provide a valid URL, VTT file, or a local audio file." >&2
    exit 1
fi

if [[ "$1" == *.vtt ]]; then
    echo "Processing as VTT file..."
    add_job "$(printf 'process_vtt %q %q' "$1" "$1")"
elif [[ "$1" == *"http"* ]]; then
    echo "Processing as YouTube URL..."

    # Extract the video title; without it every derived file name is empty.
    if ! VIDEO_TITLE=$(yt-dlp --get-title "$1") || [ -z "$VIDEO_TITLE" ]; then
        echo "Error: Failed to retrieve the video title. Check the URL and your internet connection." >&2
        exit 1
    fi
    FINAL_BASE_NAME=$(sanitize_filename "$VIDEO_TITLE")

    # Attempt to download subtitles first
    yt-dlp -N 3 --skip-download --write-auto-sub --sub-lang en \
        --cookies-from-browser brave --output "$OUTPUT_DIR/${FINAL_BASE_NAME}.%(ext)s" "$1"

    # yt-dlp inserts the language code before the extension
    # ("<name>.en.vtt"), so match any suffix between name and ".vtt".
    VTT_FILE=$(find "$OUTPUT_DIR" -name "${FINAL_BASE_NAME}*.vtt" | head -n 1)

    if [ -n "$VTT_FILE" ]; then
        echo "Subtitles found, processing VTT file..."
        add_job "$(printf 'process_vtt %q %q' "$VTT_FILE" "$1")"
    else
        echo "No subtitles found, downloading audio and generating transcript..."
        if [ "$DISABLE_AUDIO" = false ]; then
            if ! yt-dlp -N 3 -x --audio-format wav --postprocessor-args "-ar 16k" \
                --cookies-from-browser brave --output "$OUTPUT_DIR/${FINAL_BASE_NAME}.%(ext)s" "$1"; then
                echo "Error: Failed to download audio using yt-dlp. Check the URL and your internet connection." >&2
                exit 1
            fi

            WAV_FILE=$(find "$OUTPUT_DIR" -name "${FINAL_BASE_NAME}.wav" | head -n 1)

            if [ -z "$WAV_FILE" ]; then
                echo "Error: WAV file not found after download. Check yt-dlp output." >&2
                exit 1
            fi

            echo "Running Whisper-CPP to generate VTT transcript..."
            if ! "$WHISPCC"/main -ovtt -tdrz -m "$MODEL_PATH" "$WAV_FILE"; then
                echo "Error: Whisper-CPP transcription failed. Check the model path and audio file." >&2
                exit 1
            fi
            VTT_FILE="${WAV_FILE%.*}.vtt"
            # Whisper-CPP appends ".vtt" to the full input name
            # ("<name>.wav.vtt"); rename it as the local-file branch does.
            if [ -f "${WAV_FILE}.vtt" ]; then
                mv "${WAV_FILE}.vtt" "$VTT_FILE"
            fi

            add_job "$(printf 'process_vtt %q %q' "$VTT_FILE" "$1")"

            # Convert WAV to OGG Opus
            echo "Converting WAV to OGG Opus..."
            OGG_FILE="${WAV_FILE%.wav}.ogg"
            if ! ffmpeg -i "$WAV_FILE" -c:a libopus -b:a 16k -vbr on -compression_level 10 -y "$OGG_FILE"; then
                echo "Error: Failed to convert to OGG format." >&2
                exit 1
            fi
            echo " - Audio: $OGG_FILE"
            # Remove the WAV file
            rm "$WAV_FILE"
        else
            # Nothing can be queued in this case; say so instead of
            # finishing silently with an empty job queue.
            echo "Warning: Audio processing is disabled; no transcript was generated." >&2
        fi
    fi
elif [ -f "$1" ]; then
    echo "Processing as local audio file..."
    INPUT_FILE="$1"
    WAV_FILE="${INPUT_FILE%.*}.wav"

    # Convert to WAV first if not already WAV
    if [[ "$INPUT_FILE" != *.wav ]]; then
        echo "Converting input to WAV format..."
        if ! ffmpeg -i "$INPUT_FILE" -ar 16000 -ac 1 -c:a pcm_s16le ${DURATION:+-t "$DURATION"} -y "$WAV_FILE"; then
            echo "Error: Failed to convert input to WAV format." >&2
            exit 1
        fi
    else
        WAV_FILE="$INPUT_FILE"
    fi

    echo "Running Whisper-CPP to generate VTT transcript..."
    if ! "$WHISPCC"/main -ovtt -tdrz -m "$MODEL_PATH" "$WAV_FILE"; then
        echo "Error: Whisper-CPP transcription failed." >&2
        exit 1
    fi

    VTT_FILE="${WAV_FILE%.wav}.vtt"
    # Whisper-CPP writes "<input>.vtt", i.e. "<name>.wav.vtt".
    if [ -f "${WAV_FILE}.vtt" ]; then
        mv "${WAV_FILE}.vtt" "$VTT_FILE"
    fi
    add_job "$(printf 'process_vtt %q %q' "$VTT_FILE" "$1")"

    if [ "$DISABLE_AUDIO" = false ]; then
        # Convert to OGG Opus
        echo "Converting to OGG Opus..."
        OGG_FILE="${WAV_FILE%.*}.ogg"
        if ! ffmpeg -i "$WAV_FILE" -c:a libopus -b:a 16k -vbr on -compression_level 10 -y "$OGG_FILE"; then
            echo "Error: Failed to convert to OGG format." >&2
            exit 1
        fi
        echo " - Audio: $OGG_FILE"
        # Remove the WAV file per CHARTER point 7
        rm "$WAV_FILE"
    fi
else
    echo "Error: Invalid input. Provide a valid URL, VTT file, or a local audio file." >&2
    exit 1
fi

process_job_queue
tetris32b.html ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Tetris Game</title>
    <style>
        body {
            display: flex;
            flex-direction: column;
            justify-content: center;
            align-items: center;
            height: 100vh;
            margin: 0;
            background-color: #282c34;
        }
        canvas {
            border: 1px solid #fff;
        }
        #score {
            color: #fff;
            font-family: sans-serif;
            font-size: 1.5em;
            margin-bottom: 0.5em;
        }
    </style>
</head>
<body>
    <!-- updateScore() writes into this element; without it the first call
         throws and the game loop never starts. -->
    <div id="score">0</div>
    <canvas id="tetris" width="240" height="400"></canvas>
    <script>
        // Playfield size in cells and the pixel size of one cell.
        const COLS = 12;
        const ROWS = 20;
        const SCALE = 20;

        const canvas = document.getElementById('tetris');
        // Size the canvas from the playfield so the visible area and the
        // arena always agree. Must happen before scale(): resizing a canvas
        // resets its context state.
        canvas.width = COLS * SCALE;
        canvas.height = ROWS * SCALE;
        const context = canvas.getContext('2d');

        // After this, one drawing unit equals one cell.
        context.scale(SCALE, SCALE);

        // Remove every full row (scanning bottom-up), push an empty row on
        // top for each, and award points that double per row in one sweep.
        function arenaSweep() {
            let rowCount = 1;
            outer: for (let y = arena.length - 1; y > 0; --y) {
                for (let x = 0; x < arena[y].length; ++x) {
                    if (arena[y][x] === 0) {
                        continue outer;
                    }
                }

                const row = arena.splice(y, 1)[0].fill(0);
                arena.unshift(row);
                // Rows above shifted down by one: re-check the same index.
                ++y;

                player.score += rowCount * 10;
                rowCount *= 2;
            }
        }

        // True when any filled cell of the player's piece overlaps a filled
        // arena cell or lies outside the arena (undefined !== 0).
        function collide(arena, player) {
            const [m, o] = [player.matrix, player.pos];
            for (let y = 0; y < m.length; ++y) {
                for (let x = 0; x < m[y].length; ++x) {
                    if (m[y][x] !== 0 &&
                        (arena[y + o.y] &&
                         arena[y + o.y][x + o.x]) !== 0) {
                        return true;
                    }
                }
            }
            return false;
        }

        // Build an h-row by w-column matrix filled with zeros.
        function createMatrix(w, h) {
            const matrix = [];
            while (h--) {
                matrix.push(new Array(w).fill(0));
            }
            return matrix;
        }

        // Return the square matrix for a tetromino letter; the non-zero
        // value of each cell is the piece's index into `colors`.
        function createPiece(type) {
            if (type === 'T') {
                return [
                    [0, 0, 0],
                    [1, 1, 1],
                    [0, 1, 0],
                ];
            } else if (type === 'O') {
                return [
                    [2, 2],
                    [2, 2],
                ];
            } else if (type === 'L') {
                return [
                    [0, 3, 0],
                    [0, 3, 0],
                    [0, 3, 3],
                ];
            } else if (type === 'J') {
                return [
                    [0, 4, 0],
                    [0, 4, 0],
                    [4, 4, 0],
                ];
            } else if (type === 'I') {
                return [
                    [0, 5, 0, 0],
                    [0, 5, 0, 0],
                    [0, 5, 0, 0],
                    [0, 5, 0, 0],
                ];
            } else if (type === 'S') {
                return [
                    [0, 6, 6],
                    [6, 6, 0],
                    [0, 0, 0],
                ];
            } else if (type === 'Z') {
                return [
                    [7, 7, 0],
                    [0, 7, 7],
                    [0, 0, 0],
                ];
            }
        }

        // Repaint the background, the settled blocks and the falling piece.
        function draw() {
            context.fillStyle = '#282c34';
            context.fillRect(0, 0, canvas.width, canvas.height);

            drawMatrix(arena, { x: 0, y: 0 });
            drawMatrix(player.matrix, player.pos);
        }

        // Paint every non-zero cell of `matrix`, shifted by `offset` cells.
        function drawMatrix(matrix, offset) {
            matrix.forEach((row, y) => {
                row.forEach((value, x) => {
                    if (value !== 0) {
                        context.fillStyle = colors[value];
                        context.fillRect(x + offset.x,
                                         y + offset.y,
                                         1, 1);
                    }
                });
            });
        }

        // Copy the player's piece into the arena at its current position.
        function merge(arena, player) {
            player.matrix.forEach((row, y) => {
                row.forEach((value, x) => {
                    if (value !== 0) {
                        arena[y + player.pos.y][x + player.pos.x] = value;
                    }
                });
            });
        }

        // Move the piece down one row; if that collides, lock it in place,
        // spawn the next piece and clear completed rows.
        function playerDrop() {
            player.pos.y++;
            if (collide(arena, player)) {
                player.pos.y--;
                merge(arena, player);
                playerReset();
                arenaSweep();
                updateScore();
            }
            dropCounter = 0;
        }

        // Shift the piece horizontally by `dir` cells unless that collides.
        function playerMove(dir) {
            player.pos.x += dir;
            if (collide(arena, player)) {
                player.pos.x -= dir;
            }
        }

        // Rotate the piece, nudging it sideways (+1, -2, +3, ...) to escape
        // collisions; undo the rotation when no nearby position fits.
        function playerRotate(dir) {
            const pos = player.pos.x;
            let offset = 1;
            rotate(player.matrix, dir);
            while (collide(arena, player)) {
                player.pos.x += offset;
                offset = -(offset + (offset > 0 ? 1 : -1));
                if (offset > player.matrix[0].length) {
                    rotate(player.matrix, -dir);
                    player.pos.x = pos;
                    return;
                }
            }
        }

        // Rotate a square matrix in place: transpose, then reverse each row
        // (dir > 0) or reverse the row order (otherwise).
        function rotate(matrix, dir) {
            for (let y = 0; y < matrix.length; ++y) {
                for (let x = 0; x < y; ++x) {
                    [
                        matrix[x][y],
                        matrix[y][x],
                    ] = [
                        matrix[y][x],
                        matrix[x][y],
                    ];
                }
            }

            if (dir > 0) {
                matrix.forEach(row => row.reverse());
            } else {
                matrix.reverse();
            }
        }

        // Spawn a random piece centred on the top row. If it collides
        // immediately the game is over: clear the arena and reset the score.
        function playerReset() {
            const pieces = 'ILJOTSZ';
            player.matrix = createPiece(pieces[pieces.length * Math.random() | 0]);
            player.pos.y = 0;
            player.pos.x = (arena[0].length / 2 | 0) -
                           (player.matrix[0].length / 2 | 0);
            if (collide(arena, player)) {
                arena.forEach(row => row.fill(0));
                player.score = 0;
                updateScore();
            }
        }

        // Milliseconds accumulated since the last automatic drop.
        let dropCounter = 0;
        // Milliseconds between automatic drops.
        let dropInterval = 1000;

        let lastTime = 0;

        // Animation-frame callback: advance gravity by the elapsed time,
        // redraw, and schedule the next frame.
        function update(time = 0) {
            const deltaTime = time - lastTime;

            dropCounter += deltaTime;
            if (dropCounter > dropInterval) {
                playerDrop();
            }

            lastTime = time;

            draw();
            requestAnimationFrame(update);
        }

        // Show the current score in the #score element.
        function updateScore() {
            document.getElementById('score').textContent = player.score;
        }

        // Fill colours indexed by the cell values used in createPiece();
        // index 0 (empty cell) is never drawn.
        const colors = [
            null,
            '#FF0D72',
            '#0DC2FF',
            '#0DFF72',
            '#F538FF',
            '#FF8E0D',
            '#FFE138',
            '#3877FF',
        ];

        const arena = createMatrix(COLS, ROWS);

        const player = {
            pos: {x: 0, y: 0},
            matrix: null,
            score: 0,
        };

        // Controls: left/right arrows move, down arrow drops one row,
        // Q rotates one way and W the other.
        document.addEventListener('keydown', event => {
            if (event.keyCode === 37) {
                playerMove(-1);
            } else if (event.keyCode === 39) {
                playerMove(1);
            } else if (event.keyCode === 40) {
                playerDrop();
            } else if (event.keyCode === 81) {
                playerRotate(-1);
            } else if (event.keyCode === 87) {
                playerRotate(1);
            }
        });

        playerReset();
        updateScore();
        update();

    </script>
</body>
</html>
vttclean.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+
3
+ import re
4
+ import datetime
5
+ import glob
6
+ import sys
7
+
8
def clean_text(text: str) -> str:
    """Return caption text with markup removed and whitespace normalised.

    Anything that looks like a tag (``<...>``) is dropped, every run of
    whitespace (including newlines) collapses to a single space, and
    leading/trailing whitespace is stripped.
    """
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove leading/trailing whitespace
    return text.strip()
15
+
16
def is_prefix(a: str, b: str) -> bool:
    """Return True when string *a* is a prefix of string *b*."""
    return b.startswith(a)
18
+
19
def process_vtt(content: str) -> str:
    """Convert WebVTT text into de-duplicated ``<timestamp> <text>`` lines.

    Each cue becomes one line made of its start time (``HH:MM:SS.mmm``) and
    its cleaned text. When consecutive cues only extend the previous text
    (the rolling captions produced by auto-generated subtitles), only the
    last, most complete cue of the run is kept, with that cue's timestamp.

    Blocks without a timing line (the ``WEBVTT`` header, NOTE/STYLE blocks)
    are ignored, so no separate header stripping is needed. Cues may carry
    an identifier line before the timing line, and the hours part of a
    timestamp is optional; it is emitted as ``00`` when absent.

    Returns the resulting lines joined with ``\\n`` (empty string if the
    input contains no usable cues).
    """
    # Start time at the beginning of a timing line; hours are optional.
    timing_re = re.compile(r'(?:(\d{2,}):)?(\d{2}):(\d{2})\.(\d{3})')

    # Drop a byte-order mark and normalise line endings so that blank-line
    # splitting works for CRLF files too.
    content = content.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')

    processed_captions = []
    pending_line = None  # most complete line of the current run
    pending_text = None  # its caption text, without the timestamp

    for block in re.split(r'\n\n+', content):
        lines = block.split('\n')

        # The timing line is the first one containing the "-->" separator;
        # it is preceded by at most an optional cue identifier.
        timing_index = next(
            (i for i, line in enumerate(lines) if '-->' in line), None
        )
        if timing_index is None:
            continue

        timestamp_match = timing_re.match(lines[timing_index].strip())
        if not timestamp_match:
            continue

        hours, minutes, seconds, millis = timestamp_match.groups()
        timestamp = f"{hours or '00'}:{minutes}:{seconds}.{millis}"

        clean_caption = clean_text(' '.join(lines[timing_index + 1:]))
        if not clean_caption:
            continue

        # A caption that does not extend the pending one starts a new run:
        # emit the pending line before replacing it.
        if pending_text is not None and not is_prefix(pending_text, clean_caption):
            processed_captions.append(pending_line)

        pending_line = f"{timestamp} {clean_caption}"
        pending_text = clean_caption

    # Don't forget the last run.
    if pending_line is not None:
        processed_captions.append(pending_line)

    return '\n'.join(processed_captions)
59
+
60
if __name__ == "__main__":
    # Command-line entry point: clean every file matching the argument and
    # print the result to stdout. Errors go to stderr with exit status 1.
    import os

    if len(sys.argv) < 2:
        print("Usage: python vttclean.py <file_pattern>", file=sys.stderr)
        sys.exit(1)

    file_pattern = sys.argv[1]

    # An existing path is used literally: file names such as
    # "Title [abc123].vtt" contain glob metacharacters and would not match
    # themselves if treated as a pattern.
    if os.path.isfile(file_pattern):
        filenames = [file_pattern]
    else:
        filenames = sorted(glob.glob(file_pattern))

    # Matching nothing is an error, not a silent success with empty output.
    if not filenames:
        print(
            f"Error processing input: no files match {file_pattern!r}",
            file=sys.stderr,
        )
        sys.exit(1)

    try:
        for filename in filenames:
            with open(filename, 'r', encoding='utf-8') as file:
                content = file.read()
            result = process_vtt(content)
            print(result)
    except Exception as e:
        # Top-level boundary: report the failure and signal it to the caller.
        print(f"Error processing input: {e}", file=sys.stderr)
        sys.exit(1)