User:UninvitedCompany/user script

steps

 * 1) Download the split files for en.old from http://download.wikimedia.org.  As the database grows there occasionally come to be more split files than there are links.  Therefore, after downloading e.g. xaa, xab, xac, xad, xae through the links it also may be necessary to download e.g. xaf and xag.
 * 2) Compile wikitrunc.c with gcc
 * 3) $ cat xa? | bunzip2 | wikitrunc > trunc_old.sql
 * 4) $ mysql --database=test --user=administrator --password=
 * 5) At the sql prompt, @trunc_old.sql
 * 6) Allow a week or more for a typical pc.
 * 7) Run the pre.sql script  (14 hours)
 * 8) Cut and paste Special:Listadmins to a local file named adminlist.txt.
 * 9) Edit the post.sql script to change minimums (if desired) and run it; you get a file called "list"
 * 10) Edit the fmtlist.c to change the cutoff date, compile with gcc
 * 11) $ fmtlist < list > list.wiki
 * 12) Paste the contents of list.wiki into the wiki page "Another list of Wikipedians in order of arrival".

wikitrunc.c
#include <stdio.h>
#include <stdlib.h>

/*
 * Report a fatal error and stop the program.
 *
 * Writes `str` followed by a newline to stderr, then terminates the process
 * with exit status 1.  This function never returns to its caller.
 */
void myerror(const char *str)
{
    fprintf(stderr, "%s\n", str);
    exit(1);
}

/* Defined further down in this file; declared here because main() uses them first. */
int copyfield(void);
int truncfield(void);

/*
 * wikitrunc: filter an SQL dump from stdin to stdout.
 *
 * Everything up to and including the first "VALUES " keyword is copied
 * verbatim.  Each parenthesised tuple that follows is then rewritten: the
 * first three fields are copied, the fourth is passed through truncfield(),
 * and the remaining fields up to the closing ')' are copied.  After a
 * statement's last tuple, input is copied until the next "VALUES " keyword
 * and the process repeats until end of input.
 *
 * Malformed input (a tuple that does not start with '(' or end with ')', or
 * more than about 100 mismatching characters while looking for the next
 * "VALUES ") is reported through myerror(), which exits with status 1.
 */
int main(void)
{
    static const char values[] = "VALUES ";
    const int values_len = (int)(sizeof values - 1);
    int state = 0;
    int c = EOF;
    int i;

    /* Copy the preamble, tracking how much of "VALUES " has been matched. */
    while (state < values_len && (c = getchar()) != EOF) {
        putchar(c);
        if (values[state] == c) {
            state++;
        } else {
            state = 0;
        }
    }

    while (c != EOF) {
        do {
            /* want ( */
            if ((c = getchar()) != '(') {
                myerror("format error, expecting (");
            }
            putchar(c);

            /* want ..., ... , ... , */
            for (i = 0; i < 3; i++) {
                copyfield();
            }

            truncfield();

            /* Copy whatever fields remain, up to the closing parenthesis. */
            while ((c = copyfield()) != ')' && c != EOF) {
                /* nothing: copyfield() does the copying */
            }

            if (c != ')') {
                myerror("expecting )");
            }

            /* ',' means another tuple follows; anything else ends the list. */
            c = getchar();
            if (c != EOF) {
                putchar(c); /* never write the EOF sentinel as a byte */
            }
        } while (c == ',');

        /* ; INSERT INTO old VALUES ( */
        state = i = 0;
        while (state < values_len && (c = getchar()) != EOF) {
            putchar(c);
            if (values[state] == c) {
                state++;
            } else {
                state = 0;
                if (i++ > 100) {
                    myerror("garbage looking for INSERT INTO old VALUES");
                }
            }
        }
    }

    /*
     * Echo up to 100 leftover input characters to stderr as a diagnostic.
     * The loop above only ends at end of input, so normally nothing is left.
     */
    for (i = 100; i-- && (c = getchar()) != EOF; ) {
        fputc(c, stderr);
    }

    return 0;
}

/*
 * Copy one field of a value tuple from stdin to stdout.
 *
 * A field that starts with a single quote is treated as a quoted string: it
 * is copied up to its closing quote, and a backslash copies the following
 * character blindly so that an escaped quote does not end the string.
 * After that (or immediately, for an unquoted field) characters are copied
 * up to and including the next ',' or ')'.
 *
 * Returns the delimiter that ended the field (',' or ')'), or EOF if input
 * ran out first.  EOF itself is never written to stdout, and an unterminated
 * quoted string ends the call instead of looping forever.
 */
int copyfield(void)
{
    int c = getchar();

    if (c == EOF) {
        return EOF;
    }
    putchar(c);

    if (c == '\'') {
        for (;;) {
            c = getchar();
            if (c == EOF) {
                return EOF;
            }
            putchar(c);
            if (c == '\'') {
                break; /* closing quote */
            }
            if (c == '\\') {
                /* Escape sequence: pass the next character through untouched. */
                int escaped = getchar();
                if (escaped == EOF) {
                    return EOF;
                }
                putchar(escaped);
            }
        }
    }

    /* Copy through the field delimiter. */
    while (c != ',' && c != ')') {
        c = getchar();
        if (c == EOF) {
            return EOF;
        }
        putchar(c);
    }

    return c;
}

/*
 * Copy one field of a value tuple from stdin to stdout, truncating a quoted
 * string to at most 40 characters.
 *
 * For a field that starts with a single quote, the opening quote, up to 40
 * characters of content, and the closing quote are written; the rest of the
 * string is read and discarded.  A backslash escape (backslash plus the
 * following character) counts as one character and is always written or
 * dropped as a pair, so the output never ends in half an escape.
 * An unquoted field is copied unchanged.  In both cases input is then copied
 * up to and including the next ',' or ')'.
 *
 * Returns the delimiter that ended the field (',' or ')'), or EOF if input
 * ran out first.  EOF itself is never written to stdout.
 */
int truncfield(void)
{
    enum { MAX_COPY = 40 };
    int remaining = MAX_COPY;
    int c = getchar();

    if (c == EOF) {
        return EOF;
    }
    putchar(c);

    if (c == '\'') {
        for (;;) {
            c = getchar();
            if (c == EOF) {
                return EOF;
            }
            if (c == '\'') {
                putchar(c); /* the closing quote is always kept */
                break;
            }
            if (c == '\\') {
                /* Consume the escaped character even when output is full. */
                int escaped = getchar();
                if (escaped == EOF) {
                    return EOF;
                }
                if (remaining > 0) {
                    putchar(c);
                    putchar(escaped);
                    remaining--;
                }
                continue;
            }
            if (remaining > 0) {
                putchar(c);
                remaining--;
            }
        }
    }

    /* Copy through the field delimiter. */
    while (c != ',' && c != ')') {
        c = getchar();
        if (c == EOF) {
            return EOF;
        }
        putchar(c);
    }

    return c;
}

pre.sql
# Start from a clean slate: remove both working tables if they exist.
drop table if exists edit_months;
drop table if exists track_users;

# One row per user name with that user's total edit count, keeping only
# users with at least 200 edits.
create table track_users as
    select count(old_user_text) as total_edits,
           old_user_text
      from old
     group by old_user_text
    having total_edits > 199;
# takes 32 minutes

# Remove user names that contain three dots (the dotted form of an IP
# address), then re-apply the 200-edit minimum.
delete from track_users where old_user_text like '%.%.%.%';
delete from track_users where total_edits < 200;
# leaves about 2500 users of interest

# One row per tracked user and month (yyyymm = timestamp with its last eight
# digits dropped), keeping only months with at least five edits.
create table edit_months as
    select count(floor(old_timestamp / 100000000)) as month_edits,
           floor(old_timestamp / 100000000) as yyyymm,
           old.old_user_text as user_text,
           total_edits
      from old
     inner join track_users
        on old.old_user_text = track_users.old_user_text
     group by yyyymm, user_text
    having month_edits > 4;


# takes 14 hours 30 minutes

post.sh
# post.sh -- as far as this listing shows: writes SQL statements into
# adminadd.sql, runs the mysql client against the "test" database, and
# finally deletes adminadd.sql.
#
# NOTE(review): this listing looks damaged.  Text that originally sat between
# '<' and '>' characters (here-document bodies, input/output redirections,
# and probably the start of the final SELECT) appears to be missing, and line
# breaks inside the here-documents seem to have been collapsed.  The script
# is unlikely to run as shown -- TODO recover the original before use.
cat > adminadd.sql <>adminadd.sql

cat >> adminadd.sql <<eof ('bogus last record') ;

delete from adminlist where user_text = 'bogus last record'; eof

mysql --database=test --user=administrator --password= <list < 1000 and months_active > 5) or admin='Y' order by yyyymm_first, user_text; eof

rm adminadd.sql

fmtlist.c
/*
 * fmtlist: turn the tab-separated query result on stdin into wiki text on
 * stdout.
 *
 * The headers are included here because this listing is a complete,
 * separate source file with no include section of its own.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum {
    LINE_CAP = 500,         /* longest input line kept; any excess is dropped */
    FIELD_CAP = 16,         /* capacity of the short numeric/date fields */
    CUTOFF_YYYYMM = 200408  /* users last active before this month are "inactive" */
};

/*
 * Read one line from stdin into buf (capacity cap), without its newline.
 * Characters that do not fit are discarded so the next call starts on a
 * fresh line.  Returns 1 when a line was read, 0 at end of input.
 */
static int read_line(char *buf, size_t cap)
{
    size_t len;

    if (fgets(buf, (int)cap, stdin) == NULL) {
        return 0;
    }
    len = strcspn(buf, "\n");
    if (buf[len] == '\n') {
        buf[len] = '\0';
    } else {
        /* Over-long line (or last line without newline): drop the rest. */
        int c;
        while ((c = getchar()) != EOF && c != '\n') {
            /* discard */
        }
    }
    return 1;
}

/*
 * Copy the tab-delimited field starting at p into dst (capacity cap),
 * truncating if necessary and always NUL-terminating.  Returns a pointer to
 * the start of the next field, or to the terminating NUL when the line has
 * no more fields, so the caller can never step past the end of the line.
 */
static const char *next_field(const char *p, char *dst, size_t cap)
{
    size_t n = 0;

    while (*p != '\0' && *p != '\t') {
        if (n + 1 < cap) {
            dst[n++] = *p;
        }
        p++;
    }
    dst[n] = '\0';
    return (*p == '\t') ? p + 1 : p;
}

/*
 * Month name for a "yyyymm" string.  Returns "" when the string is too short
 * or the month number is outside 1..12, instead of indexing out of bounds.
 */
static const char *month_name(const char *yyyymm)
{
    static const char *const months[] = {
        "", "January", "February", "March", "April", "May", "June", "July",
        "August", "September", "October", "November", "December"
    };
    long month;

    if (strlen(yyyymm) < 5) {
        return months[0];
    }
    month = strtol(yyyymm + 4, NULL, 10);
    if (month < 1 || month > 12) {
        return months[0];
    }
    return months[month];
}

/*
 * Reads lines of the form
 *     name <TAB> first <TAB> last <TAB> active <TAB> edits <TAB> admin
 * from stdin, where first/last are yyyymm values and admin starts with 'Y'
 * for administrators.  The first line is a header and is skipped.
 *
 * Output: a fixed introduction, then one "== Month yyyy ==" heading each
 * time the `first` value changes, and one "#. name (edits)" entry per line,
 * with '*' after administrators and an "inactive since" note when `last` is
 * earlier than CUTOFF_YYYYMM.
 *
 * NOTE(review): the original passed `name` twice to a format containing a
 * single %s, so the entry format may once have held wiki-link markup that
 * was lost from this listing -- confirm before relying on the exact output.
 */
int main(void)
{
    char buf[LINE_CAP];
    char name[LINE_CAP];
    char first[FIELD_CAP], last[FIELD_CAP], active[FIELD_CAP], edits[FIELD_CAP];
    char savefirst[FIELD_CAP] = "";
    const char *p;
    int admin;

    read_line(buf, sizeof buf); /* throw away header line */

    fputs("== About this list ==\n"
          "This list was compiled using an automated script.\n"
          "A manually-maintained list that includes "
          "some interesting commentary but that does not include all users is also available.\n"
          "\n"
          "The purpose of this list is to document the contributions of early contributors,"
          "to draw attention to those prolific contributors who have left the project so that we may better "
          "retain contributors in the future, \n"
          "and to serve as a resource for those who wish to better understand social trends at Wikipedia.\n\n"
          "The list includes Wikipedians who made more than 1000 edits and were active "
          "for more than six months."
          "Administrators are included for comparison even if they do not meet the "
          "edit count and acvity criteria.\n"
          "Edit counts are shown in parenthesis. Administrators are indicated with a *.\n"
          "Wikipedians are listed as inactive when they no longer make at least five edits per month.\n"
          "Anonymous contributors are not shown.\n"
          "\n"
          "The list is based on data as of October 10, 2004. Very recent edits -- those marked (top) in a user's "
          "contribution list -- are not included because of technical limitations of the script.\n\n",
          stdout);

    while (read_line(buf, sizeof buf)) {
        p = next_field(buf, name, sizeof name);
        p = next_field(p, first, sizeof first);
        p = next_field(p, last, sizeof last);
        p = next_field(p, active, sizeof active); /* parsed but not printed */
        p = next_field(p, edits, sizeof edits);
        admin = (*p == 'Y');

        /* New arrival month: start a new section. */
        if (strcmp(first, savefirst) != 0) {
            printf("\n== %s %.4s ==\n", month_name(first), first);
            strcpy(savefirst, first); /* same capacity, so this always fits */
        }

        if (admin) {
            printf("#. %s*", name);
        } else {
            printf("#. %s", name);
        }

        printf(" (%s", edits);
        printf(")");

        if (strtol(last, NULL, 10) < CUTOFF_YYYYMM) {
            printf(" - inactive since %s %.4s", month_name(last), last);
        }
        printf("\n");
    }

    return 0;
}