Patching Dec 9, 2021 6-7a CST- All GitLab services may be unavailable for 5-10 minutes

Commit 5920b862 authored by Roland Haas's avatar Roland Haas
Browse files

parallel_extract.pl: proof of concept of parallel extractor

parent 6ae92dd1
#!/usr/bin/perl
use strict;
use warnings;
use POSIX::2008 qw(preadv); # read without updating file pointer
# there is an incompatible change in 0.13:
# --8<--
# Since version 0.13, the parameter order of pread/pwrite is (count, offset) as
# with the actual system calls instead of (offset, count). Good luck.
# --8<--
# and since I don't care about detecting the version in use, I use preadv
# instead which does not change its order of arguments.
# proof of concenpt parallel extract using GNU tar and Perl IPC
#
# this script takes a index file like the one written by mpitar, possible with
# some lines removed and extracts only the listed files using up to MAX-PROCS
# tar processes
if(scalar @ARGV != 3) {
print STDERR "usage: NAX-PROCS data.tar.idx data.tar\n";
exit 1;
}
# max no. of parallel tar instances
my $max_running = $ARGV[0];
# which files to extract
my %files;
open(my $files_fh, "<", $ARGV[1]) or die "Could not open $ARGV[1]: $!";
while(<$files_fh>) {
chomp;
m/([0-9A-Fa-f]*) (.*)/;
$files{int($1)} = $2;
}
close($files_fh);
# do work
open(my $tar_fh, "<", $ARGV[2]) or die "Could not open $ARGV[2]: $!";
binmode $tar_fh;
# iterate over all requested files and start a tar process for them at the
# right spot in the file
my %pids;
my %done;
foreach my $off (sort {$a <=> $b} keys %files) {
$files{$off} =~ m!^(.*)/[^/]*$!;
my $dirpart = $1;
# if this file needs a directory or similar in its path that does not yet
# exist or is not yet created, wait for this to happen. This is not ideal but
# for a proof of concept good enough.
my $have_to_wait = scalar keys %pids >= $max_running;
if (defined $dirpart) {
if (not exists $done{$dirpart}) {
if (not -e $dirpart) {
$have_to_wait = 1;
} else {
# already exists in file system, remember this
$done{$dirpart} = 1;
}
}
}
if ($have_to_wait) {
my $pid = wait();
die "No child processes even though there should be" if $pid == -1;
die "extracting '$pids{$pid}' failed with error code $?" if $? != 0;
$done{$pids{$pid}} = 1;
delete $pids{$pid};
redo;
}
my $pid = fork();
die "Fork faild: $!" if not defined $pid;
# tar command to extract single file from stdin
if ($pid) { # parent
$pids{$pid} = $files{$off};
} else { # child
# get amount of data for this file
# TODO: this could be derived from the offsets if I have a full index file
my @heads;
preadv($tar_fh, @heads, [512], $off) or die $!;
my $head = $heads[0];
my $sz = oct(substr($head, 124, 12));
my $typeflag = substr($head, 156, 1);
if($typeflag eq "x") {
# found extended header, which may contain a file size
my @extheads;
preadv($tar_fh, @extheads, [$sz], $off+512) or die $!;
my $exthead = $extheads[0];
my $pax_sz = ($sz + 511) & ~511;
$sz = undef;
# check if the extentded records contain a size and use that iffound
foreach my $record (split '\n', $exthead) {
$record =~ m/\s*(\d+)\s+(\w+)=(.*)/ or
die "$record does not look like a extended header record.";
if($2 eq "size") {
$sz = int($3);
}
}
unless(defined($sz)) {
# no size record, read file size from actual (next) header
preadv($tar_fh, @heads, [512], $off+512+$pax_sz) or die $!;
$head = $heads[0];
$sz = oct(substr($head, 124, 12));
}
$sz += 512 + $pax_sz;
}
my $rnd_sz = 512 + (($sz + 511) & ~511); # size rounded up plus tar header
# copy data to tar for extraction
my @tar = ("tar", "-x", "--occurrence=1", $files{$off});
open(my $tarpipe, "|-", join(" ", @tar)) or die "Failed to exec ".join(" ",@tar).": $!";
binmode $tarpipe;
my $bufsz = 1024*1024; # read in 1MiB chunks
while ($rnd_sz > 0) {
my $chunk_sz = $rnd_sz > $bufsz ? $bufsz : $rnd_sz;
my @bufs;
preadv($tar_fh, @bufs, [$chunk_sz], $off) or die $!;
$off += $chunk_sz;
$rnd_sz -= $chunk_sz;
syswrite($tarpipe, $bufs[0], $chunk_sz) or die $!;
}
my $term = "\0" x 1024;
syswrite($tarpipe, $term);
close($tarpipe);
exit $?;
}
}
# wait for all tar to finish
while (keys %pids) {
my $pid = wait();
die "No child processes even though there should be" if $pid == -1;
die "extracting '$pids{$pid}' failed with error code $?" if $? != 0;
$done{$pids{$pid}} = 1;
delete $pids{$pid};
}
close($tar_fh);
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment