001/**
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.activemq.store.kahadb.scheduler;
018
019import java.io.DataInput;
020import java.io.DataOutput;
021import java.io.File;
022import java.io.FilenameFilter;
023import java.io.IOException;
024import java.util.ArrayList;
025import java.util.Collection;
026import java.util.HashMap;
027import java.util.HashSet;
028import java.util.Iterator;
029import java.util.List;
030import java.util.Map;
031import java.util.Map.Entry;
032import java.util.Set;
033import java.util.TreeSet;
034import java.util.UUID;
035
036import org.apache.activemq.broker.scheduler.JobScheduler;
037import org.apache.activemq.broker.scheduler.JobSchedulerStore;
038import org.apache.activemq.protobuf.Buffer;
039import org.apache.activemq.store.kahadb.AbstractKahaDBStore;
040import org.apache.activemq.store.kahadb.JournalCommand;
041import org.apache.activemq.store.kahadb.KahaDBMetaData;
042import org.apache.activemq.store.kahadb.Visitor;
043import org.apache.activemq.store.kahadb.data.KahaAddScheduledJobCommand;
044import org.apache.activemq.store.kahadb.data.KahaDestroySchedulerCommand;
045import org.apache.activemq.store.kahadb.data.KahaRemoveScheduledJobCommand;
046import org.apache.activemq.store.kahadb.data.KahaRemoveScheduledJobsCommand;
047import org.apache.activemq.store.kahadb.data.KahaRescheduleJobCommand;
048import org.apache.activemq.store.kahadb.data.KahaTraceCommand;
049import org.apache.activemq.store.kahadb.disk.index.BTreeVisitor;
050import org.apache.activemq.store.kahadb.disk.journal.DataFile;
051import org.apache.activemq.store.kahadb.disk.journal.Location;
052import org.apache.activemq.store.kahadb.disk.page.Page;
053import org.apache.activemq.store.kahadb.disk.page.PageFile;
054import org.apache.activemq.store.kahadb.disk.page.Transaction;
055import org.apache.activemq.store.kahadb.disk.util.VariableMarshaller;
056import org.apache.activemq.store.kahadb.scheduler.legacy.LegacyStoreReplayer;
057import org.apache.activemq.util.ByteSequence;
058import org.apache.activemq.util.IOHelper;
059import org.slf4j.Logger;
060import org.slf4j.LoggerFactory;
061
public class JobSchedulerStoreImpl extends AbstractKahaDBStore implements JobSchedulerStore {

    private static final Logger LOG = LoggerFactory.getLogger(JobSchedulerStoreImpl.class);

    // Root meta-data for this store; reset to a fresh instance on unload and on
    // index-corruption recovery (see load()).
    private JobSchedulerKahaDBMetaData metaData = new JobSchedulerKahaDBMetaData(this);
    private final MetaDataMarshaller metaDataMarshaller = new MetaDataMarshaller(this);
    // Live schedulers by name; mutated under the index write lock in
    // getJobScheduler()/removeJobScheduler().
    private final Map<String, JobSchedulerImpl> schedulers = new HashMap<String, JobSchedulerImpl>();
    // Lazily defaulted in getLegacyStoreArchiveDirectory() when unset.
    private File legacyStoreArchiveDirectory;

    /**
     * The Scheduler Token is used to identify base revisions of the Scheduler store.  A store
     * based on the initial scheduler design will not have this tag in its meta-data and will
     * indicate an update is needed.  Later versions of the scheduler can also change this value
     * to indicate incompatible store bases which require complete meta-data and journal rewrites
     * instead of simpler meta-data updates.
     */
    static final UUID SCHEDULER_STORE_TOKEN = UUID.fromString("57ed642b-1ee3-47b3-be6d-b7297d500409");

    /**
     * The default scheduler store version.  All new store instance will be given this version and
     * earlier versions will be updated to this version.
     */
    static final int CURRENT_VERSION = 1;
085
086    @Override
087    public JobScheduler getJobScheduler(final String name) throws Exception {
088        this.indexLock.writeLock().lock();
089        try {
090            JobSchedulerImpl result = this.schedulers.get(name);
091            if (result == null) {
092                final JobSchedulerImpl js = new JobSchedulerImpl(this);
093                js.setName(name);
094                getPageFile().tx().execute(new Transaction.Closure<IOException>() {
095                    @Override
096                    public void execute(Transaction tx) throws IOException {
097                        js.createIndexes(tx);
098                        js.load(tx);
099                        metaData.getJobSchedulers().put(tx, name, js);
100                    }
101                });
102                result = js;
103                this.schedulers.put(name, js);
104                if (isStarted()) {
105                    result.start();
106                }
107                this.pageFile.flush();
108            }
109            return result;
110        } finally {
111            this.indexLock.writeLock().unlock();
112        }
113    }
114
115    @Override
116    public boolean removeJobScheduler(final String name) throws Exception {
117        boolean result = false;
118
119        this.indexLock.writeLock().lock();
120        try {
121            final JobSchedulerImpl js = this.schedulers.remove(name);
122            result = js != null;
123            if (result) {
124                js.stop();
125                getPageFile().tx().execute(new Transaction.Closure<IOException>() {
126                    @Override
127                    public void execute(Transaction tx) throws IOException {
128                        metaData.getJobSchedulers().remove(tx, name);
129                        js.removeAll(tx);
130                    }
131                });
132            }
133        } finally {
134            this.indexLock.writeLock().unlock();
135        }
136        return result;
137    }
138
139    /**
140     * Sets the directory where the legacy scheduler store files are archived before an
141     * update attempt is made.  Both the legacy index files and the journal files are moved
142     * to this folder prior to an upgrade attempt.
143     *
144     * @param directory
145     *      The directory to move the legacy Scheduler Store files to.
146     */
147    public void setLegacyStoreArchiveDirectory(File directory) {
148        this.legacyStoreArchiveDirectory = directory;
149    }
150
151    /**
152     * Gets the directory where the legacy Scheduler Store files will be archived if the
153     * broker is started and an existing Job Scheduler Store from an old version is detected.
154     *
155     * @return the directory where scheduler store legacy files are archived on upgrade.
156     */
157    public File getLegacyStoreArchiveDirectory() {
158        if (this.legacyStoreArchiveDirectory == null) {
159            this.legacyStoreArchiveDirectory = new File(getDirectory(), "legacySchedulerStore");
160        }
161
162        return this.legacyStoreArchiveDirectory.getAbsoluteFile();
163    }
164
    /**
     * Loads the scheduler store: starts the journal, loads the index page file and
     * falls back to a legacy-store upgrade or a full index rebuild from the journal
     * when the index cannot be loaded as-is.
     *
     * @throws IOException if the store cannot be loaded or recovered.
     */
    @Override
    public void load() throws IOException {
        // compareAndSet guards against concurrent / repeated load calls.
        if (opened.compareAndSet(false, true)) {
            getJournal().start();
            try {
                loadPageFile();
            } catch (UnknownStoreVersionException ex) {
                // Store pre-dates SCHEDULER_STORE_TOKEN: archive the old files and
                // replay them into a freshly initialized store.
                LOG.info("Can't start until store update is performed.");
                upgradeFromLegacy();
                // Restart with the updated store
                getJournal().start();
                loadPageFile();
                LOG.info("Update from legacy Scheduler store completed successfully.");
            } catch (Throwable t) {
                LOG.warn("Index corrupted. Recovering the index through journal replay. Cause: {}", t.toString());
                LOG.debug("Index load failure", t);

                // try to recover index
                try {
                    pageFile.unload();
                } catch (Exception ignore) {
                }
                // Either keep the corrupted index around for diagnosis or discard it,
                // then rebuild from scratch; recover() below replays the journal.
                if (isArchiveCorruptedIndex()) {
                    pageFile.archive();
                } else {
                    pageFile.delete();
                }
                metaData = new JobSchedulerKahaDBMetaData(this);
                pageFile = null;
                loadPageFile();
            }
            startCheckpoint();
            recover();
        }
        LOG.info("{} started.", this);
    }
201
202    @Override
203    public void unload() throws IOException {
204        if (opened.compareAndSet(true, false)) {
205            for (JobSchedulerImpl js : this.schedulers.values()) {
206                try {
207                    js.stop();
208                } catch (Exception e) {
209                    throw new IOException(e);
210                }
211            }
212            this.indexLock.writeLock().lock();
213            try {
214                if (pageFile != null && pageFile.isLoaded()) {
215                    metaData.setState(KahaDBMetaData.CLOSED_STATE);
216
217                    if (metaData.getPage() != null) {
218                        pageFile.tx().execute(new Transaction.Closure<IOException>() {
219                            @Override
220                            public void execute(Transaction tx) throws IOException {
221                                tx.store(metaData.getPage(), metaDataMarshaller, true);
222                            }
223                        });
224                    }
225                }
226            } finally {
227                this.indexLock.writeLock().unlock();
228            }
229
230            checkpointLock.writeLock().lock();
231            try {
232                if (metaData.getPage() != null) {
233                    checkpointUpdate(true);
234                }
235            } finally {
236                checkpointLock.writeLock().unlock();
237            }
238            synchronized (checkpointThreadLock) {
239                if (checkpointThread != null) {
240                    try {
241                        checkpointThread.join();
242                        checkpointThread = null;
243                    } catch (InterruptedException e) {
244                    }
245                }
246            }
247
248            if (pageFile != null) {
249                pageFile.unload();
250                pageFile = null;
251            }
252            if (this.journal != null) {
253                journal.close();
254                journal = null;
255            }
256
257            metaData = new JobSchedulerKahaDBMetaData(this);
258        }
259        LOG.info("{} stopped.", this);
260    }
261
    /**
     * Loads (or initializes) the index page file under the index write lock.  An
     * empty page file gets a freshly allocated meta-data root page at page 0; an
     * existing one has its meta-data read back from page 0.  All persisted
     * schedulers are then loaded and started.
     *
     * @throws IOException if the page file or meta-data cannot be loaded.
     */
    private void loadPageFile() throws IOException {
        this.indexLock.writeLock().lock();
        try {
            final PageFile pageFile = getPageFile();
            pageFile.load();
            pageFile.tx().execute(new Transaction.Closure<IOException>() {
                @Override
                public void execute(Transaction tx) throws IOException {
                    if (pageFile.getPageCount() == 0) {
                        // Brand new store: the meta-data root must land on page 0.
                        Page<JobSchedulerKahaDBMetaData> page = tx.allocate();
                        assert page.getPageId() == 0;
                        page.set(metaData);
                        metaData.setPage(page);
                        metaData.setState(KahaDBMetaData.CLOSED_STATE);
                        metaData.initialize(tx);
                        tx.store(metaData.getPage(), metaDataMarshaller, true);
                    } else {
                        // Existing store: read the meta-data back from page 0.
                        Page<JobSchedulerKahaDBMetaData> page = null;
                        page = tx.load(0, metaDataMarshaller);
                        metaData = page.get();
                        metaData.setPage(page);
                    }
                    metaData.load(tx);
                    metaData.loadScheduler(tx, schedulers);
                    // A scheduler that fails to start is logged and skipped rather
                    // than failing the whole store load.
                    for (JobSchedulerImpl js : schedulers.values()) {
                        try {
                            js.start();
                        } catch (Exception e) {
                            JobSchedulerStoreImpl.LOG.error("Failed to load " + js.getName(), e);
                        }
                    }
                }
            });

            pageFile.flush();
        } finally {
            this.indexLock.writeLock().unlock();
        }
    }
301
    /**
     * Upgrades a legacy scheduler store in place: shuts down the current journal and
     * page file, archives the old store files, initializes a fresh store and replays
     * the legacy scheduled jobs into it as add commands, then closes everything so
     * the caller can restart against the upgraded store.
     *
     * @throws IOException if the archive move or the legacy replay fails.
     */
    private void upgradeFromLegacy() throws IOException {

        // Tear down the journal and page file before touching files on disk.
        journal.close();
        journal = null;
        try {
            pageFile.unload();
            pageFile = null;
        } catch (Exception ignore) {}

        File storeDir = getDirectory().getAbsoluteFile();
        File storeArchiveDir = getLegacyStoreArchiveDirectory();

        LOG.info("Attempting to move old store files from {} to {}", storeDir, storeArchiveDir);

        // Move only the known store files, locks and other items left in place.
        IOHelper.moveFiles(storeDir, storeArchiveDir, new FilenameFilter() {

            @Override
            public boolean accept(File dir, String name) {
                if (name.endsWith(".data") || name.endsWith(".redo") || name.endsWith(".log") || name.endsWith(".free")) {
                    return true;
                }
                return false;
            }
        });

        // We reset everything to clean state, then we can read from the old
        // scheduler store and replay the scheduled jobs into this one as adds.
        getJournal().start();
        metaData = new JobSchedulerKahaDBMetaData(this);
        pageFile = null;
        loadPageFile();

        LegacyStoreReplayer replayer = new LegacyStoreReplayer(getLegacyStoreArchiveDirectory());
        replayer.load();
        replayer.startReplay(this);

        // Cleanup after replay and store what we've done.
        pageFile.tx().execute(new Transaction.Closure<IOException>() {
            @Override
            public void execute(Transaction tx) throws IOException {
                tx.store(metaData.getPage(), metaDataMarshaller, true);
            }
        });

        checkpointUpdate(true);
        // Close again; the caller (load()) restarts the journal and reloads the
        // page file against the upgraded store.
        getJournal().close();
        getPageFile().unload();
    }
351
    /**
     * Writes the store meta-data for a checkpoint and, when requested, garbage
     * collects journal log files that are no longer referenced by the index, the
     * last update location, or any still-relevant remove command.
     *
     * @param tx
     *      The TX under which the checkpoint is performed.
     * @param cleanup
     *      true to also locate and delete unreferenced journal log files.
     *
     * @throws IOException if an error occurs writing meta-data or removing log files.
     */
    @Override
    protected void checkpointUpdate(Transaction tx, boolean cleanup) throws IOException {
        LOG.debug("Job Scheduler Store Checkpoint started.");

        // reflect last update exclusive of current checkpoint
        Location lastUpdate = metaData.getLastUpdateLocation();
        metaData.setState(KahaDBMetaData.OPEN_STATE);
        tx.store(metaData.getPage(), metaDataMarshaller, true);
        pageFile.flush();

        if (cleanup) {
            // Start with every journal file as a GC candidate, then remove any that
            // are still needed.
            final TreeSet<Integer> completeFileSet = new TreeSet<Integer>(journal.getFileMap().keySet());
            final TreeSet<Integer> gcCandidateSet = new TreeSet<Integer>(completeFileSet);

            LOG.trace("Last update: {}, full gc candidates set: {}", lastUpdate, gcCandidateSet);

            if (lastUpdate != null) {
                gcCandidateSet.remove(lastUpdate.getDataFileId());
            }

            // Any file with a positive reference count in the journal RC index is in use.
            this.metaData.getJournalRC().visit(tx, new BTreeVisitor<Integer, Integer>() {

                @Override
                public void visit(List<Integer> keys, List<Integer> values) {
                    for (Integer key : keys) {
                        if (gcCandidateSet.remove(key)) {
                            LOG.trace("Removed referenced file: {} from GC set", key);
                        }
                    }
                }

                @Override
                public boolean isInterestedInKeysBetween(Integer first, Integer second) {
                    return true;
                }
            });

            LOG.trace("gc candidates after reference check: {}", gcCandidateSet);

            // If there are GC candidates then check the remove command location to see
            // if any of them can go or if they must stay in order to ensure proper recovery.
            //
            // A log containing any remove commands must be kept until all the logs with the
            // add commands for all the removed jobs have been dropped.
            if (!gcCandidateSet.isEmpty()) {
                Iterator<Entry<Integer, List<Integer>>> removals = metaData.getRemoveLocationTracker().iterator(tx);
                List<Integer> orphans = new ArrayList<Integer>();
                while (removals.hasNext()) {
                    boolean orphanedRemove = true;
                    Entry<Integer, List<Integer>> entry = removals.next();

                    // If this log is not a GC candidate then there's no need to do a check to rule it out
                    if (gcCandidateSet.contains(entry.getKey())) {
                        for (Integer addLocation : entry.getValue()) {
                            if (completeFileSet.contains(addLocation)) {
                                LOG.trace("A remove in log {} has an add still in existance in {}.", entry.getKey(), addLocation);
                                orphanedRemove = false;
                                break;
                            }
                        }

                        // If it's not orphaned then we can't remove it, otherwise we
                        // stop tracking it; its log will get deleted on the next check.
                        if (!orphanedRemove) {
                            gcCandidateSet.remove(entry.getKey());
                        } else {
                            LOG.trace("All removes in log {} are orphaned, file can be GC'd", entry.getKey());
                            orphans.add(entry.getKey());
                        }
                    }
                }

                // Drop all orphaned removes from the tracker.
                for (Integer orphan : orphans) {
                    metaData.getRemoveLocationTracker().remove(tx, orphan);
                }
            }

            LOG.trace("gc candidates after removals check: {}", gcCandidateSet);
            if (!gcCandidateSet.isEmpty()) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Cleanup removing the data files: " + gcCandidateSet);
                }
                journal.removeDataFiles(gcCandidateSet);
            }
        }

        LOG.debug("Job Scheduler Store Checkpoint complete.");
    }
441
442    /**
443     * Adds a reference for the journal log file pointed to by the given Location value.
444     *
445     * To prevent log files in the journal that still contain valid data that needs to be
446     * kept in order to allow for recovery the logs must have active references.  Each Job
447     * scheduler should ensure that the logs are accurately referenced.
448     *
449     * @param tx
450     *      The TX under which the update is to be performed.
451     * @param location
452     *      The location value to update the reference count of.
453     *
454     * @throws IOException if an error occurs while updating the journal references table.
455     */
456    protected void incrementJournalCount(Transaction tx, Location location) throws IOException {
457        int logId = location.getDataFileId();
458        Integer val = metaData.getJournalRC().get(tx, logId);
459        int refCount = val != null ? val.intValue() + 1 : 1;
460        metaData.getJournalRC().put(tx, logId, refCount);
461    }
462
463    /**
464     * Removes one reference for the Journal log file indicated in the given Location value.
465     *
466     * The references are used to track which log files cannot be GC'd.  When the reference count
467     * on a log file reaches zero the file id is removed from the tracker and the log will be
468     * removed on the next check point update.
469     *
470     * @param tx
471     *      The TX under which the update is to be performed.
472     * @param location
473     *      The location value to update the reference count of.
474     *
475     * @throws IOException if an error occurs while updating the journal references table.
476     */
477    protected void decrementJournalCount(Transaction tx, Location location) throws IOException {
478        int logId = location.getDataFileId();
479        Integer refCount = metaData.getJournalRC().get(tx, logId);
480        if (refCount != null) {
481            int refCountValue = refCount;
482            refCountValue--;
483            if (refCountValue <= 0) {
484                metaData.getJournalRC().remove(tx, logId);
485            } else {
486                metaData.getJournalRC().put(tx, logId, refCountValue);
487            }
488        }
489    }
490
491    /**
492     * Removes multiple references for the Journal log file indicated in the given Location map.
493     *
494     * The references are used to track which log files cannot be GC'd.  When the reference count
495     * on a log file reaches zero the file id is removed from the tracker and the log will be
496     * removed on the next check point update.
497     *
498     * @param tx
499     *      The TX under which the update is to be performed.
500     * @param decrementsByFileIds
501     *      Map indicating how many decrements per fileId.
502     *
503     * @throws IOException if an error occurs while updating the journal references table.
504     */
505    protected void decrementJournalCount(Transaction tx, HashMap<Integer, Integer> decrementsByFileIds) throws IOException {
506        for(Map.Entry<Integer, Integer> entry : decrementsByFileIds.entrySet()) {
507            int logId = entry.getKey();
508            Integer refCount = metaData.getJournalRC().get(tx, logId);
509
510            if (refCount != null) {
511                int refCountValue = refCount;
512                refCountValue -= entry.getValue();
513                if (refCountValue <= 0) {
514                    metaData.getJournalRC().remove(tx, logId);
515                } else {
516                    metaData.getJournalRC().put(tx, logId, refCountValue);
517                }
518            }
519        }
520    }
521
522    /**
523     * Updates the Job removal tracking index with the location of a remove command and the
524     * original JobLocation entry.
525     *
526     * The JobLocation holds the locations in the logs where the add and update commands for
527     * a job stored.  The log file containing the remove command can only be discarded after
528     * both the add and latest update log files have also been discarded.
529     *
530     * @param tx
531     *      The TX under which the update is to be performed.
532     * @param location
533     *      The location value to reference a remove command.
534     * @param removedJob
535     *      The original JobLocation instance that holds the add and update locations
536     *
537     * @throws IOException if an error occurs while updating the remove location tracker.
538     */
539    protected void referenceRemovedLocation(Transaction tx, Location location, JobLocation removedJob) throws IOException {
540        int logId = location.getDataFileId();
541        List<Integer> removed = this.metaData.getRemoveLocationTracker().get(tx, logId);
542        if (removed == null) {
543            removed = new ArrayList<Integer>();
544        }
545        removed.add(removedJob.getLocation().getDataFileId());
546        this.metaData.getRemoveLocationTracker().put(tx, logId, removed);
547    }
548
549    /**
550     * Updates the Job removal tracking index with the location of a remove command and the
551     * original JobLocation entry.
552     *
553     * The JobLocation holds the locations in the logs where the add and update commands for
554     * a job stored.  The log file containing the remove command can only be discarded after
555     * both the add and latest update log files have also been discarded.
556     *
557     * @param tx
558     *      The TX under which the update is to be performed.
559     * @param location
560     *      The location value to reference a remove command.
561     * @param removedJobsFileId
562     *      List of the original JobLocation instances that holds the add and update locations
563     *
564     * @throws IOException if an error occurs while updating the remove location tracker.
565     */
566    protected void referenceRemovedLocation(Transaction tx, Location location, List<Integer> removedJobsFileId) throws IOException {
567        int logId = location.getDataFileId();
568        List<Integer> removed = this.metaData.getRemoveLocationTracker().get(tx, logId);
569        if (removed == null) {
570            removed = new ArrayList<Integer>();
571        }
572        removed.addAll(removedJobsFileId);
573        this.metaData.getRemoveLocationTracker().put(tx, logId, removed);
574    }
575
576    /**
577     * Retrieve the scheduled Job's byte blob from the journal.
578     *
579     * @param location
580     *      The location of the KahaAddScheduledJobCommand that originated the Job.
581     *
582     * @return a ByteSequence containing the payload of the scheduled Job.
583     *
584     * @throws IOException if an error occurs while reading the payload value.
585     */
586    protected ByteSequence getPayload(Location location) throws IOException {
587        KahaAddScheduledJobCommand job = (KahaAddScheduledJobCommand) this.load(location);
588        Buffer payload = job.getPayload();
589        return new ByteSequence(payload.getData(), payload.getOffset(), payload.getLength());
590    }
591
    /** Acquires the index read lock; pair with {@link #readUnlockIndex()}. */
    public void readLockIndex() {
        this.indexLock.readLock().lock();
    }
595
    /** Releases the index read lock acquired via {@code readLockIndex()}. */
    public void readUnlockIndex() {
        this.indexLock.readLock().unlock();
    }
599
    /** Acquires the index write lock; pair with {@link #writeUnlockIndex()}. */
    public void writeLockIndex() {
        this.indexLock.writeLock().lock();
    }
603
    /** Releases the index write lock acquired via {@code writeLockIndex()}. */
    public void writeUnlockIndex() {
        this.indexLock.writeLock().unlock();
    }
607
608    @Override
609    public String toString() {
610        return "JobSchedulerStore: " + getDirectory();
611    }
612
    // Base file name for this store's index page file.
    @Override
    protected String getPageFileName() {
        return "scheduleDB";
    }
617
    // Default data directory used when no explicit directory is configured.
    @Override
    protected File getDefaultDataDirectory() {
        return new File(IOHelper.getDefaultDataDirectory(), "delayedDB");
    }
622
623    private class MetaDataMarshaller extends VariableMarshaller<JobSchedulerKahaDBMetaData> {
624
625        private final JobSchedulerStoreImpl store;
626
627        MetaDataMarshaller(JobSchedulerStoreImpl store) {
628            this.store = store;
629        }
630
631        @Override
632        public JobSchedulerKahaDBMetaData readPayload(DataInput dataIn) throws IOException {
633            JobSchedulerKahaDBMetaData rc = new JobSchedulerKahaDBMetaData(store);
634            rc.read(dataIn);
635            return rc;
636        }
637
638        @Override
639        public void writePayload(JobSchedulerKahaDBMetaData object, DataOutput dataOut) throws IOException {
640            object.write(dataOut);
641        }
642    }
643
644    /**
645     * Called during index recovery to rebuild the index from the last known good location.  For
646     * entries that occur before the last known good position we just ignore then and move on.
647     *
648     * @param command
649     *        the command read from the Journal which should be used to update the index.
650     * @param location
651     *        the location in the index where the command was read.
652     * @param inDoubtlocation
653     *        the location in the index known to be the last time the index was valid.
654     *
655     * @throws IOException if an error occurs while recovering the index.
656     */
657    protected void doRecover(JournalCommand<?> data, final Location location, final Location inDoubtlocation) throws IOException {
658        if (inDoubtlocation != null && location.compareTo(inDoubtlocation) >= 0) {
659            process(data, location);
660        }
661    }
662
663    /**
664     * Called during recovery to allow the store to rebuild from scratch.
665     *
666     * @param data
667     *      The command to process, which was read from the Journal.
668     * @param location
669     *      The location of the command in the Journal.
670     *
671     * @throws IOException if an error occurs during command processing.
672     */
673    @Override
674    protected void process(JournalCommand<?> data, final Location location) throws IOException {
675        data.visit(new Visitor() {
676            @Override
677            public void visit(final KahaAddScheduledJobCommand command) throws IOException {
678                final JobSchedulerImpl scheduler;
679
680                indexLock.writeLock().lock();
681                try {
682                    try {
683                        scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler());
684                    } catch (Exception e) {
685                        throw new IOException(e);
686                    }
687                    getPageFile().tx().execute(new Transaction.Closure<IOException>() {
688                        @Override
689                        public void execute(Transaction tx) throws IOException {
690                            scheduler.process(tx, command, location);
691                        }
692                    });
693
694                    processLocation(location);
695                } finally {
696                    indexLock.writeLock().unlock();
697                }
698            }
699
700            @Override
701            public void visit(final KahaRemoveScheduledJobCommand command) throws IOException {
702                final JobSchedulerImpl scheduler;
703
704                indexLock.writeLock().lock();
705                try {
706                    try {
707                        scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler());
708                    } catch (Exception e) {
709                        throw new IOException(e);
710                    }
711                    getPageFile().tx().execute(new Transaction.Closure<IOException>() {
712                        @Override
713                        public void execute(Transaction tx) throws IOException {
714                            scheduler.process(tx, command, location);
715                        }
716                    });
717
718                    processLocation(location);
719                } finally {
720                    indexLock.writeLock().unlock();
721                }
722            }
723
724            @Override
725            public void visit(final KahaRemoveScheduledJobsCommand command) throws IOException {
726                final JobSchedulerImpl scheduler;
727
728                indexLock.writeLock().lock();
729                try {
730                    try {
731                        scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler());
732                    } catch (Exception e) {
733                        throw new IOException(e);
734                    }
735                    getPageFile().tx().execute(new Transaction.Closure<IOException>() {
736                        @Override
737                        public void execute(Transaction tx) throws IOException {
738                            scheduler.process(tx, command, location);
739                        }
740                    });
741
742                    processLocation(location);
743                } finally {
744                    indexLock.writeLock().unlock();
745                }
746            }
747
748            @Override
749            public void visit(final KahaRescheduleJobCommand command) throws IOException {
750                final JobSchedulerImpl scheduler;
751
752                indexLock.writeLock().lock();
753                try {
754                    try {
755                        scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler());
756                    } catch (Exception e) {
757                        throw new IOException(e);
758                    }
759                    getPageFile().tx().execute(new Transaction.Closure<IOException>() {
760                        @Override
761                        public void execute(Transaction tx) throws IOException {
762                            scheduler.process(tx, command, location);
763                        }
764                    });
765
766                    processLocation(location);
767                } finally {
768                    indexLock.writeLock().unlock();
769                }
770            }
771
772            @Override
773            public void visit(final KahaDestroySchedulerCommand command) {
774                try {
775                    removeJobScheduler(command.getScheduler());
776                } catch (Exception e) {
777                    LOG.warn("Failed to remove scheduler: {}", command.getScheduler());
778                }
779
780                processLocation(location);
781            }
782
            @Override
            public void visit(KahaTraceCommand command) {
                // Trace commands carry no scheduler state; just advance the
                // last-update marker past this journal entry.
                processLocation(location);
            }
787        });
788    }
789
    /**
     * Records the given journal position as the store's last update location,
     * guarded by the index write lock.
     *
     * @param location the journal location that was just processed.
     */
    protected void processLocation(final Location location) {
        indexLock.writeLock().lock();
        try {
            this.metaData.setLastUpdateLocation(location);
        } finally {
            indexLock.writeLock().unlock();
        }
    }
798
799    /**
800     * We recover from the Journal logs as needed to restore the index.
801     *
802     * @throws IllegalStateException
803     * @throws IOException
804     */
805    private void recover() throws IllegalStateException, IOException {
806        this.indexLock.writeLock().lock();
807        try {
808            long start = System.currentTimeMillis();
809            Location lastIndoubtPosition = getRecoveryPosition();
810            Location recoveryPosition = lastIndoubtPosition;
811
812            if (recoveryPosition != null) {
813                int redoCounter = 0;
814                LOG.info("Recovering from the scheduled job journal @" + recoveryPosition);
815                while (recoveryPosition != null) {
816                    try {
817                        JournalCommand<?> message = load(recoveryPosition);
818                        metaData.setLastUpdateLocation(recoveryPosition);
819                        doRecover(message, recoveryPosition, lastIndoubtPosition);
820                        redoCounter++;
821                    } catch (IOException failedRecovery) {
822                        if (isIgnoreMissingJournalfiles()) {
823                            LOG.debug("Failed to recover data at position:" + recoveryPosition, failedRecovery);
824                            // track this dud location
825                            journal.corruptRecoveryLocation(recoveryPosition);
826                        } else {
827                            throw new IOException("Failed to recover data at position:" + recoveryPosition, failedRecovery);
828                        }
829                    }
830                    recoveryPosition = journal.getNextLocation(recoveryPosition);
831                     if (LOG.isInfoEnabled() && redoCounter % 100000 == 0) {
832                         LOG.info("@ {}, {} entries recovered ..", recoveryPosition, redoCounter);
833                     }
834                }
835                long end = System.currentTimeMillis();
836                LOG.info("Recovery replayed {} operations from the journal in {} seconds.",
837                         redoCounter, ((end - start) / 1000.0f));
838            }
839
840            // We may have to undo some index updates.
841            pageFile.tx().execute(new Transaction.Closure<IOException>() {
842                @Override
843                public void execute(Transaction tx) throws IOException {
844                    recoverIndex(tx);
845                }
846            });
847
848        } finally {
849            this.indexLock.writeLock().unlock();
850        }
851    }
852
853    private Location getRecoveryPosition() throws IOException {
854        // This loads the first position and we completely rebuild the index if we
855        // do not override it with some known recovery start location.
856        Location result = null;
857
858        if (!isForceRecoverIndex()) {
859            if (metaData.getLastUpdateLocation() != null) {
860                result = metaData.getLastUpdateLocation();
861            }
862        }
863
864        return journal.getNextLocation(result);
865    }
866
867    private void recoverIndex(Transaction tx) throws IOException {
868        long start = System.currentTimeMillis();
869
870        // It is possible index updates got applied before the journal updates..
871        // in that case we need to removed references to Jobs that are not in the journal
872        final Location lastAppendLocation = journal.getLastAppendLocation();
873        long undoCounter = 0;
874
875        // Go through all the jobs in each scheduler and check if any are added after
876        // the last appended location and remove those.  For now we ignore the update
877        // location since the scheduled job will update itself after the next fire and
878        // a new update will replace any existing update.
879        for (Iterator<Map.Entry<String, JobSchedulerImpl>> i = metaData.getJobSchedulers().iterator(tx); i.hasNext();) {
880            Map.Entry<String, JobSchedulerImpl> entry = i.next();
881            JobSchedulerImpl scheduler = entry.getValue();
882
883            for (Iterator<JobLocation> jobLocationIterator = scheduler.getAllScheduledJobs(tx); jobLocationIterator.hasNext();) {
884                final JobLocation job = jobLocationIterator.next();
885                if (job.getLocation().compareTo(lastAppendLocation) >= 0) {
886                    if (scheduler.removeJobAtTime(tx, job.getJobId(), job.getNextTime())) {
887                        LOG.trace("Removed Job past last appened in the journal: {}", job.getJobId());
888                        undoCounter++;
889                    }
890                }
891            }
892        }
893
894        if (undoCounter > 0) {
895            // The rolled back operations are basically in flight journal writes.  To avoid getting
896            // these the end user should do sync writes to the journal.
897            long end = System.currentTimeMillis();
898            LOG.info("Rolled back {} messages from the index in {} seconds.", undoCounter, ((end - start) / 1000.0f));
899            undoCounter = 0;
900        }
901
902        // Now we check for missing and corrupt journal files.
903
904        // 1. Collect the set of all referenced journal files based on the Location of the
905        //    the scheduled jobs and the marked last update field.
906        HashSet<Integer> missingJournalFiles = new HashSet<Integer>();
907        for (Iterator<Map.Entry<String, JobSchedulerImpl>> i = metaData.getJobSchedulers().iterator(tx); i.hasNext();) {
908            Map.Entry<String, JobSchedulerImpl> entry = i.next();
909            JobSchedulerImpl scheduler = entry.getValue();
910
911            for (Iterator<JobLocation> jobLocationIterator = scheduler.getAllScheduledJobs(tx); jobLocationIterator.hasNext();) {
912                final JobLocation job = jobLocationIterator.next();
913                missingJournalFiles.add(job.getLocation().getDataFileId());
914                if (job.getLastUpdate() != null) {
915                    missingJournalFiles.add(job.getLastUpdate().getDataFileId());
916                }
917            }
918        }
919
920        // 2. Remove from that set all known data file Id's in the journal and what's left
921        //    is the missing set which will soon also contain the corrupted set.
922        missingJournalFiles.removeAll(journal.getFileMap().keySet());
923        if (!missingJournalFiles.isEmpty()) {
924            LOG.info("Some journal files are missing: {}", missingJournalFiles);
925        }
926
927        // 3. Now check all references in the journal logs for corruption and add any
928        //    corrupt journal files to the missing set.
929        HashSet<Location> corruptedLocations = new HashSet<Location>();
930
931        if (isCheckForCorruptJournalFiles()) {
932            Collection<DataFile> dataFiles = journal.getFileMap().values();
933            for (DataFile dataFile : dataFiles) {
934                int id = dataFile.getDataFileId();
935                for (long offset : dataFile.getCorruptedBlocks()) {
936                    corruptedLocations.add(new Location(id, (int) offset));
937                }
938            }
939
940            if (!corruptedLocations.isEmpty()) {
941                LOG.debug("Found some corrupted data blocks in the journal: {}", corruptedLocations.size());
942            }
943        }
944
945        // 4. Now we either fail or we remove all references to missing or corrupt journal
946        //    files from the various JobSchedulerImpl instances.  We only remove the Job if
947        //    the initial Add operation is missing when the ignore option is set, the updates
948        //    could be lost but that's price you pay when ignoring the missing logs.
949        if (!missingJournalFiles.isEmpty() || !corruptedLocations.isEmpty()) {
950            if (!isIgnoreMissingJournalfiles()) {
951                throw new IOException("Detected missing/corrupt journal files.");
952            }
953
954            // Remove all Jobs that reference an Location that is either missing or corrupt.
955            undoCounter = removeJobsInMissingOrCorruptJounralFiles(tx, missingJournalFiles, corruptedLocations);
956
957            // Clean up the Journal Reference count Map.
958            removeJournalRCForMissingFiles(tx, missingJournalFiles);
959        }
960
961        if (undoCounter > 0) {
962            long end = System.currentTimeMillis();
963            LOG.info("Detected missing/corrupt journal files.  Dropped {} jobs from the " +
964                     "index in {} seconds.", undoCounter, ((end - start) / 1000.0f));
965        }
966    }
967
968    private void removeJournalRCForMissingFiles(Transaction tx, Set<Integer> missing) throws IOException {
969        List<Integer> matches = new ArrayList<Integer>();
970
971        Iterator<Entry<Integer, Integer>> references = metaData.getJournalRC().iterator(tx);
972        while (references.hasNext()) {
973            int dataFileId = references.next().getKey();
974            if (missing.contains(dataFileId)) {
975                matches.add(dataFileId);
976            }
977        }
978
979        for (Integer match : matches) {
980            metaData.getJournalRC().remove(tx, match);
981        }
982    }
983
984    private int removeJobsInMissingOrCorruptJounralFiles(Transaction tx, Set<Integer> missing, Set<Location> corrupted) throws IOException {
985        int removed = 0;
986
987        // Remove Jobs that reference missing or corrupt files.
988        // Remove Reference counts to missing or corrupt files.
989        // Remove and remove command markers to missing or corrupt files.
990        for (Iterator<Map.Entry<String, JobSchedulerImpl>> i = metaData.getJobSchedulers().iterator(tx); i.hasNext();) {
991            Map.Entry<String, JobSchedulerImpl> entry = i.next();
992            JobSchedulerImpl scheduler = entry.getValue();
993
994            for (Iterator<JobLocation> jobLocationIterator = scheduler.getAllScheduledJobs(tx); jobLocationIterator.hasNext();) {
995                final JobLocation job = jobLocationIterator.next();
996
997                // Remove all jobs in missing log files.
998                if (missing.contains(job.getLocation().getDataFileId())) {
999                    scheduler.removeJobAtTime(tx, job.getJobId(), job.getNextTime());
1000                    removed++;
1001                    continue;
1002                }
1003
1004                // Remove all jobs in corrupted parts of log files.
1005                if (corrupted.contains(job.getLocation())) {
1006                    scheduler.removeJobAtTime(tx, job.getJobId(), job.getNextTime());
1007                    removed++;
1008                }
1009            }
1010        }
1011
1012        return removed;
1013    }
1014}