Ucpt:Extract.pl

This script parses the Gentoo Wiki's SQL database dump files to create the UCPT ebuilds, files, directory structure, and index pages. The daily backup files needed for extraction can be found here.
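
A MediaWiki SQL dump stores each table as one long INSERT statement per line, so the script recovers individual rows by splitting each line on "),(" and pulling fields out with regular expressions. The sketch below illustrates only that parsing step on the page table; the dump file name and the namespace ID 110 are taken from the script further down, while the rest is a simplified, illustrative reading of the row format and not part of the original script.

Code: parsing sketch (illustrative only, not part of extract.pl)
#!/usr/bin/perl
# Illustrative sketch only -- shows the row-splitting technique used by
# extract.pl, not its full extraction logic.
use strict;
use warnings;

my $dumpfile  = "page_table.sql.gz";   # same dump file the full script reads
my $namespace = 110;                   # "Ucpt:" namespace ID on www.gentoo-wiki.info

open my $db, "gzip -dc $dumpfile |" or die "cannot read $dumpfile: $!";
while ( my $line = <$db> ) {
    chomp $line;
    # each line is one INSERT statement; "),(" separates the row tuples
    foreach my $row ( split /\),\(/, $line ) {
        # a page row starts with: page_id, page_namespace, 'page_title', ...
        if ( $row =~ /[0-9]+,$namespace,'([^']+)'/ ) {
            print "UCPT page: $1\n";
        }
    }
}
close $db;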

Code: extract.pl
#!/usr/bin/perl
#
# ebuild and file extraction script for the UCPT database on
# www.gentoo-wiki.info. See http://www.gentoo-wiki.info/UCPT.
#
# UCPT - User-Contributed Portage Tree
#
# Usage: download page_table.sql.gz, revision_table.sql.gz and text_table.sql.gz
# and run 'perl extract.pl' where you have put the gzipped sql dump files.
#
# Note: does not remove 'stale' files (ie. those which have been
# removed from the Wiki)
#
# You need to be in the 'portage' user group in order to
# let this script build the digests for you.
#
$create_digests = 1;

# create Wiki index pages for UCPT namespace (eg. www.gentoo-wiki.info/Index:UCPT)?
$create_index_pages = 1;
$index_page_filename = "UCPT_index_page.txt";
$ebuild_index_page_filename = "UCPT_ebuild_index_page.txt";

@portage_categories = ();
undef %ebuild_wikinames_hash;       # for ebuild index creation
undef %descriptions_by_wikiname;

# The mediawiki namespace ID ("Ucpt:" on www.gentoo-wiki.info has ID 110)
$mynamespace = 110;
$mynamespace_str = "Ucpt:";

# where to create the user-contributed portage tree?
$basedir = ".";

#
# Create the digest files after having created _all_ ebuilds.
# It's not really necessary to do that afterwards.
#
@ebuild_files = ();

$error_ret_code = 0;

#
# Read in relevant page names, their dates and ids.
#
print "Parsing page_table...\n";
undef %wikiname_by_revid;
open PAGEDB, "gzip -dc page_table.sql.gz |";
$page_count = 0;
while (<PAGEDB>) {
    chomp;
    foreach (split(/\),\(/)) {
        if (s/[0-9]+,$mynamespace,\'([^\'\/]+\/[^\'\/]+\/[^\']+)\',\'.*\',[0-9]+,[0-9]+,[0-9]+,[0-9\.]+,\'([0-9]+)\',([0-9]+),[0-9]+/$revid=$3;$wikiname=$1;""/ei) {
            #print "Detected UCPT page: $wikiname ($revid)\n";
            $wikiname_by_revid{$revid} = $wikiname;
            $page_count++;
        }
    }
}
close PAGEDB;
print "Detected $page_count pages.\n";

print "Parsing revision table...\n";
undef %wikiname_by_pageid;
undef %wikidate_by_pageid;
open REVDB, "gzip -dc revision_table.sql.gz |";
$page_count = 0;
while (<REVDB>) {
    chomp;
    foreach (split(/\),\(/)) {
        if (s/([0-9]+),([0-9]+),'.*',[0-9]+,'.*','([0-9]+)',[0-9]+,[0-9]+,([0-9]+)/$revid=$1;$wikidate=$3;$pageid=$4;""/e
            && defined($wikiname_by_revid{$revid})) {
            $wikiname_by_pageid{$pageid} = $wikiname_by_revid{$revid};
            $wikidate_by_pageid{$pageid} = $wikidate;
            $page_count++;
        }
    }
}
close REVDB;
print "Found $page_count revisions.\n";
undef %wikiname_by_revid;

#
# Scan text db...
#
print "Parsing text_table...\n";
open TEXTDB, "gzip -dc text_table.sql.gz |";
while (<TEXTDB>) {
    chomp;
    foreach (split(/\),\(/)) {
        $wikiname = "";
        if (s/([0-9]+),\'/$pageid=$1;""/ei && ($wikiname = $wikiname_by_pageid{$pageid})) {
            $content = "";
            s/^(.*)\',\'[^\']*\'/$content=$1;""/ei;
            $date = substr($wikidate_by_pageid{$pageid},0,12);
            substr($wikiname,0,1) =~ tr/[A-Z]/[a-z]/;

            # remember portage categories and ebuild wikinames for index page creation
            remember_portage_categories_and_ebuild_wikinames($wikiname);

            # create file
            if (($wikiname =~ /\.ebuild$/) ||
                ($wikiname =~ /^[a-zA-Z0-9]+-[a-zA-Z0-9]+\/[^\/]+\/files\//)) {
                my $is_ebuild = 0;

                #
                # use date of wiki post as release tag
                # -- not nice but simple and efficient
                #
                $filename = $wikiname;
                if ( $filename =~ s/\.ebuild$/-r$date.ebuild/ ) {
                    $is_ebuild = 1;
                }
                my $sysfilename = $basedir."/".$filename;

                #
                # File content is enclosed in <pre> and </pre> tags.
                # Outside of these tags, one may place comments which will
                # only show up on the Wiki, but make sure there is only one
                # <pre> and only one </pre> tag in the whole Wiki text!
                #
                $content =~ s/^.*<pre>(.*)<\/pre>.*$/$1/i;

                #
                # Convert back escaped chars...
                # FIXME: need to do proper unescaping during
                # parsing of the SQL command!
                #
                $content =~ s/\\n/\n/g;
                $content =~ s/\\"/"/g;
                $content =~ s/\\'/'/g;
                $content =~ s/\\\\/\\/g;

                $is_ebuild and store_ebuild_description($wikiname,$content);

                #
                # keep last-modified time of existing _ebuild_ files
                #
                if ( ! $is_ebuild || ! -f "$sysfilename" ) {
                    print "$filename\n";
                    #print "sysfilename=$sysfilename\n";
                    my $sysdirname = $sysfilename;
                    if ($sysdirname =~ s/\/[^\/]+$//) {
                        #print "sysdirname=$sysdirname\n";
                        system("mkdir -p \"$sysdirname\"");
                    }
                    open FILE, "> $sysfilename";
                    print FILE $content;
                    close FILE;
                    {
                        my $portage_category = $filename;
                        $portage_category =~ s/^([^\/]+)\/.*$/$1/;
                        my ($main,$sub) = split ( '-', $portage_category, 2 );
                        if ( defined($portage_categories{$main}) ) {
                            $portage_categories{$main} .= ",".$sub;
                        } else {
                            $portage_categories{$main} = $sub;
                        }
                    }
                }

                #
                # Remember ebuild filename for digest creation
                #
                if ( $is_ebuild ) {
                    push @ebuild_files, $sysfilename;
                }
            }
        }
    }
}
close TEXTDB;
print "Extraction done.\n";

#
# create digests
#
while ( $ebuild_filename = pop @ebuild_files ) {
    #
    # digest creation necessary?
    #
    my $digest_filename = $ebuild_filename;
    if ( $digest_filename =~ s/\/([^\/]+)\.ebuild$/\/files\/digest-$1/ ) {
        if ( ! -f "$digest_filename" ) {
            $digest_filename =~ s/files\/digest-[^\/]+$/Manifest/;
            system ( "echo > \"$digest_filename\"" ) if ( ! -f "$digest_filename" );
            system ( "ebuild \"$ebuild_filename\" manifest" ) == 0
                or $error_ret_code = 1;
        }
    } else {
        print " * * * * ****** ERROR! ****** * * * *\n";
        $error_ret_code = 1;
    }
}

#
# create Wiki index pages:
# 1. portage categories
# 2. portage categories containing all ebuilds
#
if ( $create_index_pages ) {
    my $file_handle;
    open $file_handle, "> $index_page_filename";
    my $ebuild_index_handle;
    open $ebuild_index_handle, "> $ebuild_index_page_filename";

    #
    # get sorted and unique list of main/major portage categories
    #
    my @tmp_main_categories = ();
    my %main_existence_hash;
    my $i;
    for ( $i = 0; $i <= $#portage_categories; $i++ ) {
        my ($main,$sub) = split ( '-', $portage_categories[$i], 2 );
        if ( ! defined($main_existence_hash{$main}) ) {
            push @tmp_main_categories, $main;
            $main_existence_hash{$main} = 1;
        }
    }
    undef %main_existence_hash;
    my @sorted_unique_main_categories = sort @tmp_main_categories;
    #print join(":",@sorted_unique_main_categories)."\n";

    for ( $i = 0; $i <= $#sorted_unique_main_categories; $i++ ) {
        my $main_category = $sorted_unique_main_categories[$i];
        print $file_handle "* $main_category\n";
        print $ebuild_index_handle "* $main_category\n";

        #
        # get sorted and unique list of sub/minor portage categories
        #
        my @tmp_sub_categories = ();
        my %sub_existence_hash;
        my $j;
        for ( $j = 0; $j <= $#portage_categories; $j++ ) {
            my ($main,$sub) = split ( '-', $portage_categories[$j], 2 );
            if ( $main_category eq $main && !defined($sub_existence_hash{$sub}) ) {
                push @tmp_sub_categories, $sub;
                $sub_existence_hash{$sub} = 1;
            }
        }
        undef %sub_existence_hash;
        my @sorted_unique_sub_categories = sort @tmp_sub_categories;
        #print join(":",@sorted_unique_sub_categories)."\n";

        my $k;
        for ( $k = 0; $k <= $#sorted_unique_sub_categories; $k++ ) {
            my $sub_category = $sorted_unique_sub_categories[$k];
            print $file_handle "** $sub_category\n";
            print $ebuild_index_handle "** $sub_category\n";
            my @ebuild_wikinames = split ( "\t", $ebuild_wikinames_hash{$main_category.'-'.$sub_category} );
            my @sorted_ebuild_wikinames = sort @ebuild_wikinames;
            my $l;
            for ( $l = 0; $l <= $#sorted_ebuild_wikinames; $l++ ) {
                my $wikiname = $sorted_ebuild_wikinames[$l];
                my $nice_wikiname = $wikiname;
                $nice_wikiname =~ s/^.*\/([^\/]+)\.ebuild$/$1/;
                print $ebuild_index_handle '*** [['.$mynamespace_str.$wikiname.'|'.
                    $nice_wikiname.']] '.$descriptions_by_wikiname{$wikiname}."\n";
            }
        }
        print $file_handle "\n";
        print $ebuild_index_handle "\n";
    }
    close $ebuild_index_handle;
    close $file_handle;
}

if ( $error_ret_code == 0 ) {
    print "All OK.\n";
} elsif ( $error_ret_code == 1 ) {
    print "There were errors during the creation of the digest files!\n";
}

exit $error_ret_code;

sub remember_portage_categories_and_ebuild_wikinames {
    # categories
    my $portage_category = $_[0];
    $portage_category =~ s/^([^\/]+)\/.*$/$1/;
    push @portage_categories, $portage_category;

    # ebuild wikinames
    my $name = $_[0];
    if ( $name =~ /\.ebuild$/ ) {
        if (!defined($ebuild_wikinames_hash{$portage_category})) {
            $ebuild_wikinames_hash{$portage_category} = $name;
        } else {
            $ebuild_wikinames_hash{$portage_category} .= "\t".$name;
        }
    }
}

sub store_ebuild_description {
    my ($wikiname,$content) = @_;
    my $desc = "";
    #print "* $wikiname\n";
    if ($content =~ s/(^|\n)\s*DESCRIPTION\s*=\s*"(.+)"(\s*|\s*#.*)(\n|$)/$desc=$2;""/ei) {
        #print "** $desc\n";
        $descriptions_by_wikiname{$wikiname} = $desc;
    }
}