/ -> cron.php
1 <?php
2
3 // cron.php - Maintenance tasks. To be called from cron.
4 // cron.php - author: Nico Stuurman <nicost@sourceforge.net>
5
6 /***************************************************************************
7 * Copyright (c) 2002 by Nico Stuurman *
8 * ------------------------------------------------------------------------ *
9 * This program is free software; you can redistribute it and/or modify it *
10 * under the terms of the GNU General Public License as published by the *
11 * Free Software Foundation; either version 2 of the License, or (at your *
12 * option) any later version. *
13 \**************************************************************************/
14
15
// Indexing may run for a long time, so lift the execution time limit.
ini_set('max_execution_time', '0');

// Project include files this script relies on.
include 'includes/defines_inc.php';
include 'includes/functions_inc.php';
include 'includes/init_inc.php';
include 'includes/db_inc.php';
23
////
// !Writes the index entries needed for full text searches of files
// $filetext is split on spaces, newlines and the characters , . : ; "
// For every resulting word longer than 3 characters the id of the word is
// looked up in table 'words' (the word is added there when it is new) and one
// row (wordid, fileid, pagenr, recordid) is inserted into $indextable.
// $db is used as an ADOdb-style connection (Execute, GenID, qstr).
// $pagenr falls back to 1 when it is empty.
// Always returns true.
function doindexfile ($db,$filetext,$fileid,$indextable,$recordid,$pagenr)
{
    if (!$pagenr)
        $pagenr=1;
    // preg_split replaces the POSIX split(), which no longer exists in PHP 7+
    $thetext=preg_split('/[ ,.:;"\n]/',$filetext);
    // word => id cache, so a repeated word needs only one lookup per call
    $wordids=array();
    foreach ($thetext as $word) {
        if (strlen($word)>3) {
            if (isset($wordids[$word])) {
                $wordid=$wordids[$word];
            }
            else {
                // the words come straight from file contents, so quote them
                // before they are placed in an SQL statement
                $quoted=$db->qstr($word);
                $r=$db->Execute("SELECT id FROM words WHERE word=$quoted");
                // a failed query or an empty result both mean: word not known yet
                $wordid=($r && !$r->EOF) ? $r->fields[0] : false;
                if (!$wordid) {
                    $wordid=$db->GenID("word_seq");
                    $db->Execute("INSERT INTO words VALUES ($wordid,$quoted)");
                }
                $wordids[$word]=$wordid;
            }
            // one index row per occurrence of the word
            $db->Execute("INSERT INTO $indextable VALUES ($wordid,$fileid,$pagenr,$recordid)");
        }
    }
    return true;
}
44
45
46 // main body
47
48 // text/html files are indexed directly, pdf files are first converted
49 // with ghostscript
50 // words are entered in table 'words'
51 // links between files/records are kept in specific tables
52
// Remember when we started, so the total indexing time can be reported.
$starttime = microtime();

// Only continue when the request was addressed to the local machine.
// NOTE(review): HTTP_HOST is taken from the request's Host header, which the
// client chooses - confirm that access to this script is also restricted
// elsewhere (e.g. in the web server configuration).
$host = getenv('HTTP_HOST');
if ($host != 'localhost' && $host != '127.0.0.1') {
    echo 'This script should only be called by the CRON daemon.';
    exit();
}
61
// Path of the ghostscript binary as configured in the system settings.
// An unreadable binary is only reported here; the script carries on.
$gs = $system_settings['gs'];
if (!(@is_readable($gs))) {
    echo "Could not read ghostscipt binary (gs) at '$gs'.<br>";
}
65
66
// find unindexed files with mime types we can work with
// Text files are read directly (markup stripped); pdf files are converted to
// text one page at a time with ghostscript. Every file that was handled is
// flagged with indexed=1 so that it is not picked up again on the next run.
$rfiles=$db->Execute("SELECT id,filename,tablesfk,ftableid,mime,ftablecolumnid FROM files WHERE indexed IS NULL AND (mime LIKE '%text%' OR mime LIKE '%pdf%')");

// counters used for the statistics printed at the end of the run
$textfilecounter=0;
$pdffilecounter=0;

while ($rfiles && !($rfiles->EOF)) {
    // note: array keys are quoted; bare words as keys are an error in PHP 8
    $fileid=$rfiles->fields['id'];
    $mime=$rfiles->fields['mime'];
    $wasindexed=false;
    $indextable=false;

    // find out to which table we are going to write the index
    // (a failed or empty lookup simply leaves this file untouched)
    $rdesc=$db->Execute("SELECT table_desc_name FROM tableoftables WHERE id=".$rfiles->fields['tablesfk']);
    if ($rdesc && !$rdesc->EOF && $rdesc->fields['table_desc_name']) {
        $rindextable=$db->Execute("SELECT associated_table FROM ".$rdesc->fields['table_desc_name']." WHERE id=".$rfiles->fields['ftablecolumnid']);
        if ($rindextable && !$rindextable->EOF)
            $indextable=$rindextable->fields['associated_table'];
    }

    if ($indextable) {
        // treat text files and pdf files differently
        if (strstr($mime,"text")) {
            $fp=fopen(file_path($db,$fileid),"r");
            // a file that cannot be opened is skipped and stays unindexed,
            // so it is tried again on the next run
            if ($fp) {
                // start with an empty buffer for every file; without this the
                // text of all previously read files would be indexed again
                $filetext='';
                while (!feof($fp)) {
                    $chunk=fread($fp,64000);
                    if ($chunk===false || $chunk==='')
                        break;
                    $filetext.=$chunk;
                }
                fclose($fp);
                // strip the markup from the complete text in one go
                // (fgetss() is no longer available in PHP 8)
                $filetext=strtolower(strip_tags($filetext));
                if (doindexfile ($db,$filetext,$fileid,$indextable,$rfiles->fields['ftableid'],1)) {
                    $db->Execute ("UPDATE files SET indexed=1 WHERE id=".$fileid);
                    $textfilecounter++;
                    $wasindexed=true;
                }
            }
        }
        // for pdf files we use ghostscript. Part of this code was taken from docmgr
        elseif (strstr($mime,"pdf") && $gs) {
            $filepath=file_path($db,$fileid);
            // quote the path so that it reaches the shell as a single argument
            $shellpath=escapeshellarg($filepath);

            // first we have to figure out how many pages are in the file:
            // the page count is taken from the number that follows the word
            // 'through' in the ghostscript output; when that is missing the
            // count stays 0 and no pages are converted
            $gsoutput=(string)`$gs -dNODISPLAY $shellpath -c quit`;
            $numpages=0;
            if (preg_match('/through\s+(\d+)/',$gsoutput,$match))
                $numpages=(int)$match[1];

            for ($page=1;$page<=$numpages;$page++) {
                // gs the page and return as a string
                $tempstring=`$gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -dFirstPage=$page -dLastPage=$page -c save -f ps2ascii.ps $shellpath -c quit`;
                $filetext=strtolower((string)$tempstring);
                doindexfile ($db,$filetext,$fileid,$indextable,$rfiles->fields['ftableid'],$page);
            }
            // NOTE(review): as before, the file is flagged as indexed even
            // when no pages could be converted - confirm this is intended
            $db->Execute ("UPDATE files SET indexed=1 WHERE id=".$fileid);
            $pdffilecounter++;
            $wasindexed=true;
        }
    }
    // only report files that really were handled
    if ($wasindexed)
        echo 'Indexed file: '.$rfiles->fields['filename'].'.<br>';
    $rfiles->MoveNext();
}
122
// Report how many files were indexed and how long that took.
$textfilecounter = $textfilecounter ? $textfilecounter : 0;
$pdffilecounter = $pdffilecounter ? $pdffilecounter : 0;

// Both timestamps are strings of two space-separated numbers; the two parts
// are subtracted separately and the differences are added up.
$endtime = microtime();
list($startmu, $starts) = explode(" ", $starttime);
list($endmu, $ends) = explode(" ", $endtime);
$pt = ($ends - $starts) + ($endmu - $startmu);
$ptime = sprintf("%0f", $pt);

echo "Indexed $textfilecounter text files and $pdffilecounter pdf files in $ptime seconds<br>";
137
// load plugin php code if it has been defined
// The plugin is only run when a 'tablename' GET parameter is present.
// $_GET is checked first because $HTTP_GET_VARS is not filled in by current
// PHP versions; the old array is still honoured where it exists.
$crontablename=false;
if (isset($_GET['tablename']))
    $crontablename=$_GET['tablename'];
elseif (isset($HTTP_GET_VARS['tablename']))
    $crontablename=$HTTP_GET_VARS['tablename'];

if ($crontablename) {
    // NOTE(review): tableinfo only receives $db here; how it determines the
    // table is not visible in this file - verify it still finds 'tablename'
    $tableinfo=new tableinfo($db);
    $plugin_code=get_cell($db,"tableoftables","plugin_code","id",$tableinfo->id);
    if ($plugin_code) {
        // the file name comes from the database; @ keeps a missing or broken
        // plugin file from flooding the output with include warnings
        @include($plugin_code);
        // and execute the cron plugin
        if (function_exists("plugin_cron"))
            plugin_cron($db,$tableinfo);
    }
}
149
// we'll do the postgres maintenance
// VACUUM ANALYZE performs a VACUUM followed by an ANALYZE, so this single
// statement is sufficient; issuing VACUUM and ANALYZE separately beforehand
// only made the database do the same work twice.
if (substr($db_type,0,8)=='postgres') {
    $db->Execute('VACUUM ANALYZE');
    echo "Finished postgres maintenance.<br>";
}
?>
157 ?>