Help:Bots/Page Extraction Script(old)

Bots are scripts used to automate massive, repetitive webpage-manipulation tasks. Here is the main bot used for the page lists; it is slow and bulky.


#!/usr/bin/perl -w


 * 1) Code for Extracting list of page types
 * 2) and publish the lists
 * 3) Creator: User:ZyMOS
 * 4) Licences: GPL
 * 1) Creator: User:ZyMOS
 * 2) Licences: GPL

use CMS::MediaWiki; use LWP::Simple; use Net::hostent; use Socket; use IO::Socket; use Time::HiRes qw(time); use Getopt::Std;
 * 1) Modules
 * 1) Modules

# NTP server queried for the run timestamp, and the number of seconds to
# wait for its answer.
my $server  = "1.pool.ntp.org";
my $timeout = 2;

# Dotted-quad IPv4 address of $server, or "" when the name does not resolve.
# CORE::gethostbyname is called explicitly: `use Net::hostent` replaces the
# plain gethostbyname with a version that returns an object, and unpacking
# that object would format the bytes of its stringified form, not the address.
my $serverIPv4 = "";
{
    my $packed_addr = CORE::gethostbyname($server);    # scalar context: packed address
    $serverIPv4 = inet_ntoa($packed_addr)
        if defined $packed_addr && length($packed_addr) == 4;
}


 * 1) Configuration Section
 * 1) Configuration Section

# URLs the bot reads from. Each assignment sits on its own line so that the
# trailing comments cannot swallow the statements that follow them.
$host  = "http://en.howto.wikicities.com/wiki/Special:Allpages";
$root  = "http://en.howto.wikicities.com";    # no slash
$host2 = "http://en.howto.wikicities.com/index.php?title=Special:Wantedpages&limit=500&offset=0";
$host3 = "http://en.howto.wikicities.com/index.php?title=Special%3AAllpages&from=&namespace=10";    # templates, to be implemented
$host4 = "http://en.howto.wikicities.com/index.php?title=Special%3AAllpages&from=&namespace=12";

# Names of the local text files the bot writes, one per page category.
$log   = "a-objectsAll.txt";      # object pages
$logE  = "a-objectsEmpty.txt";    # empty object pages
$logU  = "a-unknown.txt";         # unclassified pages
$logH  = "a-howto.txt";           # howto pages
$logG  = "a-guide.txt";           # guide pages
$logO  = "a-other.txt";           # other pages
$logHe = "a-help.txt";            # help pages
$logHw = "a-howtoWanted.txt";     # wanted howto pages
$logGw = "a-guideWanted.txt";     # wanted guide pages
 * 1) Files
 * 1) Files

# Titles of the wiki pages that receive the generated lists.
$unknownPage     = "Help:Unknown List";
$objectPage      = "Help:Objects List";
$objectEmptyPage = "Help:Empty Objects";
$howtoPage       = "Help:Howto List";
$howtoWantedPage = "Help:Wanted Howtos";
$guidePage       = "Help:Guide List";
$guideWantedPage = "Help:Wanted Guides";
$helpPage        = "Help:Help Pages List";
 * 1) Page names
 * 1) Page names

# Titles of the wiki templates that receive the page counts.
$tmpHowto        = "Template:numOfHowtos";
$tmpGuide        = "Template:numOfGuides";
$tmpWantedGuide  = "Template:numOfWantedGuides";
$tmpWantedHowto  = "Template:numOfWantedHowtos";
$tmpWantedObject = "Template:numOfWantedObjects";
$tmpEmptyObject  = "Template:numOfEmptyObjects";
$tmpObject       = "Template:numOfObjects";
$tmpUnknown      = "Template:numOfUnknowns";
$tmpHelp         = "Template:numOfHelps";

# Boilerplate wrapped around every generated list page:
#   $mes1 <kind of page> $mes2 <list> $mes3
$mes1 = "\n \n" . "This page contains a list of all";

$mes2 = "on WikiHowTo. \n\n"
      . "For more information See Help:Bots "
      . "If you are searching for a specific subject See: Help:Portal List or Help:Object Lists"
      . "\n\n\n";

$mes3 = "\n\n"
      . "This page was created by a bot and the page will be cleared and refreashed daily. "
      . "You can add a link on this page and it it will be processed the next rotation";
 * 1) Messages
 * 1) Messages

# File-level state shared by the NTP subroutines.

# Local transmit timestamp in NTP epoch: value, fraction, hex and binary forms.
my (
    $LocalTime0,   $LocalTime0F,  $LocalTime0H,
    $LocalTime0FH, $LocalTime0FB,
);

# Local clock readings taken before and after the server query.
my ( $LocalTime1, $LocalTime2 );

# Local time derived from the two readings above.
my ( $LocalTime, $LocalTimeF, $LocalTimeT );

# Server time in its various renderings.
my ( $NetTime, $NetTime2, $Netfraction );

# Delay and offset figures.
my ( $netround, $netdelay, $off );

# Fields of the received packet, unpacked as integers and bit strings.
my (
    $Byte1,           $Stratum,       $Poll,            $Precision,
    $RootDelay,       $RootDelayFB,   $RootDisp,        $RootDispFB,
    $ReferenceIdent,
    $ReferenceTime,   $ReferenceTimeFB,
    $OriginateTime,   $OriginateTimeFB,
    $ReceiveTime,     $ReceiveTimeFB,
    $TransmitTime,    $TransmitTimeFB,
);

# The same fields, unpacked as hex and ASCII.
my (
    $dummy,
    $RootDelayH,      $RootDelayFH,   $RootDispH,       $RootDispFH,
    $ReferenceIdentT,
    $ReferenceTimeH,  $ReferenceTimeFH,
    $OriginateTimeH,  $OriginateTimeFH,
    $ReceiveTimeH,    $ReceiveTimeFH,
    $TransmitTimeH,   $TransmitTimeFH,
);

# Values decoded from the packet header.
my (
    $LI,     $VN,         $Mode,       $sc,
    $PollT,  $PrecisionV, $ReferenceT, $ReferenceIPv4,
);

my $ntp_msg;    # NTP message according to NTP/SNTP protocol specification


 * 1)           Code Section
 * 1)           Code Section


 * 1) Subroutines
 * 1) Subroutines

# Replace the full text of one wiki page.
#   $_[0]  title of the page to edit
#   $_[1]  new page text
# The edit is sent through the file-level $mw object. The result of editPage
# is stored in the global $rc and also returned.
sub updatePage {
    # Lexicals instead of local($a, $b): $a and $b are the globals sort() uses.
    my ($title, $text) = @_;
    $rc = $mw->editPage(
        title   => "$title",
        section => '',
        text    => "$text",
        summary => "Updated via Bot.",
    );
    return $rc;
}

# Convert a binary string (characters '0'/'1', most significant bit first)
# to the fraction it represents; for example '01' gives 0.25.
sub bin2frac {
    my @bin  = split '', shift;
    my $frac = 0;
    while (@bin) {
        # Fold from the least significant bit upwards.
        $frac = ($frac + pop @bin) / 2;
    }
    return $frac;
}    # end sub bin2frac

# Convert a fraction to a 32-character binary string (pack template B32).
sub frac2bin {
    my $frac = shift;
    my $bin  = "";
    while (length($bin) < 32) {
        $bin  = $bin . int($frac * 2);
        $frac = $frac * 2 - int($frac * 2);
    }
    return $bin;
}    # end sub frac2bin

# Open the connection to the NTP server, build and send the request packet,
# receive the reply, and take local timestamps before and after.
# Reads $server and $timeout; sets $LocalTime0, $LocalTime0F, $LocalTime0FB,
# $LocalTime0H, $LocalTime0FH, $LocalTime1 and $LocalTime2.
# Returns the raw reply, or nothing when the request could not be sent.
# Dies when the socket cannot be created; prints a message and exits when no
# answer arrives within $timeout seconds or the receive fails.
sub get_ntp_time {
    my $remote;
    my $rin = "";
    my ($rout, $eout);
    my $ntp_msg;

    # open the connection to the ntp server
    $remote = IO::Socket::INET->new(
        Proto    => "udp",
        PeerAddr => $server,
        PeerPort => 123,
        Timeout  => $timeout,
    ) or die "Can't connect to \"$server\"\n";

    # measure local time BEFORE timeserver query
    $LocalTime1 = time;
    # convert from unix epoch time to NTP timestamp
    $LocalTime0 = $LocalTime1 + 2208988800;

    # prepare local timestamp for transmission in our request packet
    $LocalTime0F  = $LocalTime0 - int($LocalTime0);
    $LocalTime0FB = frac2bin($LocalTime0F);
    $LocalTime0H  = unpack("H8", pack("N",   int($LocalTime0)));
    $LocalTime0FH = unpack("H8", pack("B32", $LocalTime0FB));

    # LI=0, VN=3, Mode=3 (client), remainder of the message is 12 nulls
    # and the local TxTimestamp derived from $LocalTime1
    $ntp_msg = pack("B8 C3 N10 B32",
        '00011011', (0) x 12, int($LocalTime0), $LocalTime0FB);

    # send the ntp-request to the server
    $remote->send($ntp_msg) or return;
    vec($rin, fileno($remote), 1) = 1;
    select($rout = $rin, undef, $eout = $rin, $timeout)
        or do { print "No answer from $server\n"; exit };

    # receive the ntp-message from the server; recv reports failure with
    # undef, and a successful call may return an empty (false) string
    defined($remote->recv($ntp_msg, length($ntp_msg)))
        or do { print "Receive error from $server ($!)\n"; exit };

    # measure local time AFTER timeserver query
    $LocalTime2 = time;

    return $ntp_msg;
}    # end sub get_ntp_time

# Decode the raw NTP reply passed as the only argument into the file-level
# packet variables ($Stratum, $Precision, $TransmitTime, ...).
sub interpret_ntp_data {
    my $ntp_msg = shift;

    # unpack the received ntp-message into long integer and binary values
    (   $Byte1,         $Stratum,         $Poll,          $Precision,
        $RootDelay,     $RootDelayFB,     $RootDisp,      $RootDispFB,
        $ReferenceIdent,
        $ReferenceTime, $ReferenceTimeFB, $OriginateTime, $OriginateTimeFB,
        $ReceiveTime,   $ReceiveTimeFB,   $TransmitTime,  $TransmitTimeFB,
    ) = unpack("a C3  n B16 n B16 H8   N B32 N B32   N B32 N B32", $ntp_msg);

    # again unpack the received ntp-message into hex and ASCII values
    (   $dummy,          $dummy,           $dummy,          $dummy,
        $RootDelayH,     $RootDelayFH,     $RootDispH,      $RootDispFH,
        $ReferenceIdentT,
        $ReferenceTimeH, $ReferenceTimeFH, $OriginateTimeH, $OriginateTimeFH,
        $ReceiveTimeH,   $ReceiveTimeFH,   $TransmitTimeH,  $TransmitTimeFH,
    ) = unpack("a C3  H4 H4 H4 H4 a4   H8 H8 H8 H8   H8 H8 H8 H8", $ntp_msg);

    # first byte: 2 bits leap indicator, 3 bits version, 3 bits mode
    $LI   = unpack("C", $Byte1 & "\xC0") >> 6;
    $VN   = unpack("C", $Byte1 & "\x38") >> 3;
    $Mode = unpack("C", $Byte1 & "\x07");

    # stratum class: 0 and 1 kept as-is, 2..15 collapse to 2, the rest to 16
    if    ($Stratum < 2)  { $sc = $Stratum; }
    elsif ($Stratum < 16) { $sc = 2; }
    else                  { $sc = 16; }

    $PollT = 2**($Poll);

    # Precision is a signed 8-bit value read as unsigned: the two's-complement
    # correction is 256 (0xFF must become -1), not 255.
    if ($Precision > 127) { $Precision = $Precision - 256; }
    $PrecisionV = sprintf("%1.4e", 2**$Precision);

    $RootDelay += bin2frac($RootDelayFB);
    $RootDelay  = sprintf("%.4f", $RootDelay);
    $RootDisp  += bin2frac($RootDispFB);
    $RootDisp   = sprintf("%.4f", $RootDisp);

    # describe the reference identifier; numeric fields compared numerically
    $ReferenceT = "";
    if ($Stratum == 1) {
        $ReferenceT = "[$ReferenceIdentT]";
    }
    elsif ($Stratum == 2) {
        if ($VN == 3) {
            $ReferenceIPv4 = sprintf("%d.%d.%d.%d", unpack("C4", $ReferenceIdentT));
            $ReferenceT    = "[32bit IPv4 address $ReferenceIPv4 of the ref src]";
        }
        elsif ($VN == 4) {
            $ReferenceT = "[low 32bits of latest TX timestamp of reference src]";
        }
    }

    # add the fractional part to each timestamp
    $ReferenceTime += bin2frac($ReferenceTimeFB);
    $OriginateTime += bin2frac($OriginateTimeFB);
    $ReceiveTime   += bin2frac($ReceiveTimeFB);
    $TransmitTime  += bin2frac($TransmitTimeFB);

    return;
}    # end sub interpret_ntp_data

# Convert the packet timestamps to unix epoch and derive the printable times
# ($NetTime, $NetTime2, $LocalTimeT, ...) and the delay/offset figures.
sub calculate_time_data {
    my ($sec, $min, $hr, $dy, $mo, $yr);

    # convert to unix epoch time stamp
    $ReferenceTime -= 2208988800;
    $OriginateTime -= 2208988800;
    $ReceiveTime   -= 2208988800;
    $TransmitTime  -= 2208988800;

    $NetTime     = scalar(gmtime $TransmitTime);
    $Netfraction = sprintf("%03.f",
        1000 * sprintf("%.3f", $TransmitTime - int($TransmitTime)));
    ($sec, $min, $hr, $dy, $mo, $yr) = gmtime($TransmitTime);
    $NetTime2 = sprintf("%04d-%02d-%02d %02d:%02d:%02d",
        $yr + 1900, $mo + 1, $dy, $hr, $min, $sec);

    # calculate delay and difference
    # NOTE(review): $netround and $netdelay use (before - after), so they come
    # out negative; left unchanged here -- confirm the intended sign.
    $netround = sprintf("%+.4f", ($LocalTime1 - $LocalTime2));
    $netdelay = sprintf("%+.4f",
        (($LocalTime1 - $LocalTime2) / 2) - ($TransmitTime - $ReceiveTime));
    $off = sprintf("%+.4f",
        (($ReceiveTime - $LocalTime1) + ($TransmitTime - $LocalTime2)) / 2);

    $LocalTime  = ($LocalTime1 + $LocalTime2) / 2;
    $LocalTimeF = sprintf("%03.f",
        1000 * sprintf("%.3f", $LocalTime - int($LocalTime)));
    ($sec, $min, $hr, $dy, $mo, $yr) = gmtime($LocalTime);
    $LocalTimeT = sprintf("%04d-%02d-%02d %02d:%02d:%02d",
        $yr + 1900, $mo + 1, $dy, $hr, $min, $sec);

    return;
}    # end sub calculate_time_data

# List bodies accumulated during classification; each starts with a newline.
$hA     = "\n";    # howto pages
$hwA    = "\n";    # wanted howto pages
$gA     = "\n";    # guide pages
$gwA    = "\n";    # wanted guide pages
$obA    = "\n";    # object pages
$obeA   = "\n";    # empty object pages
$heA    = "\n";    # help pages
$unA    = "\n";    # unknown pages
$otherA = "\n";    # other pages; previously never initialised before being appended to

# Page counters.
$numOfObjects       = 0;
$numOfEmptyObjects  = 0;
$numOfWantedObjects = 0;
$numOfHowtos        = 0;
$numOfWantedHowtos  = 0;
$numOfGuides        = 0;
$numOfWantedGuides  = 0;
$numOfUnknownPages  = 0;
$numOfHelpPages     = 0;
 * 1) clear vars
 * 1) clear vars

# Open (truncating) every log file. Three-argument open keeps the file name
# from being read as a mode, and a failed open stops the run with the reason
# instead of letting later prints go to an unopened handle.
open(OUTF,  '>', $log)   or die "Can't open $log for writing: $!\n";
open(OUTFE, '>', $logE)  or die "Can't open $logE for writing: $!\n";
open(OUTG,  '>', $logG)  or die "Can't open $logG for writing: $!\n";
open(OUTH,  '>', $logH)  or die "Can't open $logH for writing: $!\n";
open(OUTU,  '>', $logU)  or die "Can't open $logU for writing: $!\n";
open(OUTO,  '>', $logO)  or die "Can't open $logO for writing: $!\n";
open(OUTHe, '>', $logHe) or die "Can't open $logHe for writing: $!\n";
open(OUTHw, '>', $logHw) or die "Can't open $logHw for writing: $!\n";
open(OUTGw, '>', $logGw) or die "Can't open $logGw for writing: $!\n";
 * 1) Open Files
 * 1) Open Files

# Create the wiki client and log in. The option comments are on their own
# lines' ends so they no longer swallow the rest of the statement.
$mw = CMS::MediaWiki->new(
    host  => 'en.howto.wikicities.com',
    path  => 'wiki',    # Can be empty on 3rd-level domain Wikis
    debug => 1,         # 0=no debug msgs, 1=some msgs, 2=more msgs
);

# NOTE(review): the account name and password are hard-coded in a published
# script; move them to a private config file or the environment.
# NOTE(review): $rc is not inspected, so a failed login goes unnoticed --
# confirm CMS::MediaWiki's return convention and check it here.
$rc = $mw->login( user => 'Bot-Admin-ZyMOS', pass => 'crackers' );
 * 1) Login
 * 1) Login


 * 1) connect to timeserver
 * 1) connect to timeserver

# Ask the time server, decode its answer, and make sure the answer belongs to
# our request: the reply must echo our transmit timestamp as its originate
# timestamp.
$ntp_msg = get_ntp_time();
interpret_ntp_data($ntp_msg);

my $sent_stamp   = $LocalTime0H . $LocalTime0FH;
my $echoed_stamp = $OriginateTimeH . $OriginateTimeFH;
unless ($sent_stamp eq $echoed_stamp) {
    print "*** The received reply seems to be faulty and NOT the reply to our request packet:\n",
          "*** The OriginateTime stamp $OriginateTimeH.$OriginateTimeFH of the received packet does not \n",
          "*** show our Transmit Time $LocalTime0H.$LocalTime0FH.\n";
    exit;
}

calculate_time_data();

# Download the all-pages listing. get() returns undef on failure; without
# this page every published list would be empty, so stop rather than
# overwrite the wiki pages with nothing.
$content = get($host);
die "Could not download the page list from $host\n" unless defined $content;

# Download the wanted-pages listing. A failure here only affects the wanted
# lists, so warn and carry on with an empty document.
$content2 = get($host2);
unless (defined $content2) {
    warn "Could not download the wanted-pages list\n";
    $content2 = '';
}
 * 1) Dload page lists
 * 1) Dload page lists

# Cut the downloaded HTML at every double quote and keep the pieces that look
# like local page links: they contain "/wiki/" and do not name the
# wikicities.com host. The dots are escaped so they match literal dots only.
# $x ends up holding the number of links stored in @linklist.
@list = split(/\"/, defined $content ? $content : '');
$x = 0;
foreach $chunk (@list) {
    next unless $chunk =~ /\/wiki\//;
    next if $chunk =~ /\.wikicities\.com/;
    $linklist[$x] = $chunk;
    $x++;
}
 * 1) Extract page names
 * 1) Extract page names

$testlist[0] = "/wiki/bzip2";
$testlist[1] = "/wiki/Apt_command";

# Sort every extracted link into one category. Title patterns are tried
# first; a page that matches none of them is downloaded and inspected.
# Each hit is printed, written to the category's log file, appended to the
# category's list body, and counted (other pages are not counted).
foreach $link (@linklist) {
    $content = 0;
    $linkit  = substr($link, 6);    # skip the first six characters ("/wiki/" on a root-relative link)

    if (   $linkit =~ /^Help:/i
        || $linkit =~ /^About:/i
        || $linkit =~ /^HowTo_Wiki:/i
        || $linkit =~ /(Main_Page|Current_events|Disclaimer|Policy)/i)
    {
        #### Help pages
        print "H $linkit\n";
        print OUTHe "# $linkit\n";
        $heA .= "# $linkit \n";
        $numOfHelpPages++;
    }
    elsif ($linkit =~ /:/
        || $linkit =~ /^(Wikihowto|Old_page|Scrpt)/i
        || $linkit =~ /(List|Wikisolutions|Wikicities|sandbox)/i)
    {
        #### Other pages
        print "O $linkit\n";
        print OUTO "# $linkit\n";
        $otherA .= "# $linkit \n";
    }
    elsif ($linkit =~ /^Howto/i || $linkit =~ /^How to/i || $linkit =~ /^How_to/i) {
        #### howto Pages
        print "* $linkit\n";
        print OUTH "# $linkit\n";
        $hA .= "# $linkit \n";
        $numOfHowtos++;
    }
    elsif ($linkit =~ /^Guide/i) {
        #### Guides
        print "\@ $linkit\n";
        print OUTG "# $linkit\n";
        $gA .= "# $linkit \n";
        $numOfGuides++;
    }
    else {
        #### All Objects and Unknown pages
        $content = get("$root$link");
        # a failed download is treated as an empty page, i.e. "unknown"
        $content = '' unless defined $content;

        # this need improved!!!!!!!!!!!!
        if ($content =~ /object:?.?.?.?.?.?.? .a href=\"http:\/\/en.wikipedia.org\/wiki\//i) {
            print "# $linkit";
            print OUTF "# $linkit\n";
            $obA .= "# $linkit \n";
            $numOfObjects++;

            if ($content =~ /object is a currently empty/) {
                #### Empty Object Pages
                print OUTFE "# $linkit\n";
                print " EMPTY";
                $obeA .= "# $linkit \n";
                $numOfEmptyObjects++;
            }
            print "\n";
        }
        elsif ($linkit !~ /:/i) {
            #### Unknown pages
            print "? $linkit\n";
            print OUTU "# $linkit\n";
            $unA .= "# $linkit \n";
            $numOfUnknownPages++;
        }
    }
}
 * 1) Extract page types
 * 1) Extract page types

# Collect the wanted howto and guide pages from the downloaded
# Special:Wantedpages document in $content2.
#
# The earlier version split an undefined $scrap and then looped over
# @scopes, which was never filled, so no wanted page was ever found.
#
# NOTE(review): this assumes wanted pages appear as anchors carrying
# class="new" and a title attribute holding the page name -- confirm against
# the HTML the wiki actually serves. HTML entities in titles are not decoded.
$content2 = '' unless defined $content2;
while ($content2 =~ /<a\b([^>]*)>/gi) {
    my $attrs = $1;
    next unless $attrs =~ /\bclass=["']new["']/i;
    next unless $attrs =~ /\btitle=(["'])(.*?)\1/i;
    $scope = $2;

    if ($scope =~ /^Howto/i || $scope =~ /^How to/i || $scope =~ /^How_to/i) {
        ##### wanted howto pages
        print "* $scope\n";
        print OUTHw "# $scope\n";
        $hwA .= "# $scope \n";
        $numOfWantedHowtos++;
    }
    elsif ($scope =~ /^Guide/i) {
        ##### Wanted Guide Pages
        print "G $scope\n";
        print OUTGw "# $scope\n";
        $gwA .= "# $scope \n";
        $numOfWantedGuides++;
    }
}
 * 1) Extract wanted pages
 * 1) Extract wanted pages

# Close every log file that was opened, including OUTHe (the help-page log),
# which was previously left open.
close(OUTHw);
close(OUTGw);
close(OUTF);
close(OUTFE);
close(OUTU);
close(OUTG);
close(OUTH);
close(OUTO);
close(OUTHe);
print "\n";
 * 1) Close Files
 * 1) Close Files

# Publish each list page: target title, the label used in the intro
# sentence, and the list body. The order of the edits is significant only in
# that it matches the previous one.
foreach my $page (
    [ $guidePage,       "Guides",             $gA   ],
    [ $guideWantedPage, "Wanted Guides",      $gwA  ],
    [ $howtoPage,       "Howto Pages",        $hA   ],
    [ $howtoWantedPage, "Wanted Howto Pages", $hwA  ],
    [ $objectPage,      "Objects",            $obA  ],
    [ $objectEmptyPage, "Empty Objects",      $obeA ],
    [ $unknownPage,     "Unknown Pages",      $unA  ],
    [ $helpPage,        "Help Pages",         $heA  ],
) {
    my ($title, $label, $body) = @$page;
    updatePage($title, "$mes1 $label $mes2$body $mes3");
}
 * 1)  Update Pages
 * 1)  Update Pages

=begin comment

Disabled code, kept for reference. The POD block is now terminated by a
"=cut" on its own line; before, "=cut" shared the opening line, so the block
never ended and the template updates below were never executed.

$tmpHowto = "Template:numOfHowtos";
$tmpGuide = "Template:numOfGuides";
$tmpWantedGuide = "Template:numOfWantedGuides";
$tmpWantedHowto = "Template:numOfWantedHowtos";
$tmpWantedObject = "Template:numOfWantedObjects";
$tmpEmptyObject = "Template:numOfEmptyObjects";
$tmpObject = "Template:numOfObjects";
$tmpUnknown = "Template:numOfUnknowns";
$tmpHelp = "Template:numOfHelps";
$numOfObjects = 0;
$numOfEmptyObjects = 0;
$numOfWantedObjects = 0;
$numOfHowtos = 0;
$numOfWantedHowtos = 0;
$numOfGuides = 0;
$numOfWantedGuides = 0;
$numOfUnknownPages = 0;
$numOfHelpPages = 0;

=end comment

=cut

# Making templates for number of pages: write each counter into its template.
updatePage($tmpHowto,        $numOfHowtos);
updatePage($tmpWantedHowto,  $numOfWantedHowtos);
updatePage($tmpGuide,        $numOfGuides);
updatePage($tmpWantedGuide,  $numOfWantedGuides);
updatePage($tmpObject,       $numOfObjects);
updatePage($tmpWantedObject, $numOfWantedObjects);
updatePage($tmpEmptyObject,  $numOfEmptyObjects);
updatePage($tmpUnknown,      $numOfUnknownPages);
updatePage($tmpHelp,         $numOfHelpPages);

# Record the server time of this run.
updatePage("Template:NumUpdateDate", $NetTime);